In [5]:
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)

import warnings
warnings.simplefilter('ignore')

# general purpose packages
import pandas as pd
import numpy as np
import os
import json
import time
import re
import csv
import subprocess
import sys

import scipy.stats as stats
import statsmodels.stats as smstats
from statsmodels.stats.multitest import multipletests

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import sklearn

import umap

from multiprocessing import Process, Manager, Pool
import multiprocessing
from functools import partial

from collections import Counter

import seaborn as sns; sns.set()

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
matplotlib.rcParams['backend'] = "Qt5Agg"
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter

from IPython.display import display, Image

from adjustText import adjust_text
import builtins
%matplotlib inline

# for working with sam/bam files
import HTSeq

# for working with yaml files
import ruamel.yaml

import itertools
In [27]:
# paths to subdirectories
subdirs = {}

subdirs['main_project_dir'] = '/scicore/home/zavolan/GROUP/Primer_Probe_design/'
subdirs['human_annotation_dir'] = '/scicore/home/zavolan/GROUP/Genomes/homo_sapiens/'
subdirs['swine_annotation_dir'] = '/scicore/home/zavolan/GROUP/Genomes/swine/'

subdirs['shared_project_dir'] = subdirs['main_project_dir']
subdirs['PRRSV_dir'] = subdirs['shared_project_dir']+'PRRSV/'
subdirs['PRRSV_github_dir'] = subdirs['PRRSV_dir']+'github/'
subdirs['PRRSV_materials_dir'] = subdirs['PRRSV_dir']+'materials/'
subdirs['PRRSV_reference_genomes_dir'] = subdirs['PRRSV_dir']+'reference_genomes/'

# technicals
subdirs['temp_dir'] = subdirs['shared_project_dir']+'temp/'
subdirs['figures_dir'] = subdirs['shared_project_dir']+'figures/'
subdirs['tables_dir'] = subdirs['shared_project_dir']+'tables/'

# paths to files
file_paths = {}
### genome annotation files
file_paths['human_genome_file'] = subdirs['human_annotation_dir']+'GRCh38.primary_assembly.genome.fa'
file_paths['human_genome_fai_file'] = subdirs['human_annotation_dir']+'GRCh38.primary_assembly.genome.fa.fai'
file_paths['human_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/gencode.v42.annotation.gtf'
file_paths['human_RNAcentral_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/homo_sapiens.GRCh38.gff3.gz'
file_paths['human_enriched_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/enriched.gencode.v42.annotation.gtf'
file_paths['human_chrom_sizes'] = subdirs['human_annotation_dir']+'hg38.chrom.sizes'

os.system('mkdir -p '+' '.join(list(subdirs.values()))) # create all subdirs
Out[27]:
0

PRRSV - track hub¶

Initial github configuration¶

In [37]:
# cd to subdirs['PRRSV_github_dir']
# git init
# then create github repo on the github website
# then create test file


# then, for every change to put to the repo online:
# git add -A
# git commit -m "message"
# git push -u origin main

reference genomes for PRRSV-type 1 (Lelystad) and PRRSV-type 2 (VR-2332)¶

In [32]:
command = 'esearch -db nucleotide -query "'+'M96262'+'" | efetch -format fasta > '+subdirs['PRRSV_reference_genomes_dir']+'M96262.Lelystad.PRRSV_type1.fasta'+'; '
command = command+'esearch -db nucleotide -query "'+'U87392'+'" | efetch -format fasta > '+subdirs['PRRSV_reference_genomes_dir']+'U87392.VR2332.PRRSV_type2.fasta'+'; '
command = command+'cat '+subdirs['PRRSV_reference_genomes_dir']+'M96262.Lelystad.PRRSV_type1.fasta '+subdirs['PRRSV_reference_genomes_dir']+'U87392.VR2332.PRRSV_type2.fasta > '+subdirs['PRRSV_reference_genomes_dir']+'PRRSV_reference_genomes.fasta'
command
Out[32]:
'esearch -db nucleotide -query "M96262" | efetch -format fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/M96262.Lelystad.PRRSV_type1.fasta; esearch -db nucleotide -query "U87392" | efetch -format fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/U87392.VR2332.PRRSV_type2.fasta; cat /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/M96262.Lelystad.PRRSV_type1.fasta /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/U87392.VR2332.PRRSV_type2.fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/PRRSV_reference_genomes.fasta'
In [38]:
subdirs['PRRSV_github_dir']
Out[38]:
'/scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/github/'
In [ ]:
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [ ]:
myHub/ - directory to organize your files on this hub
     hub.txt – primary reference text file to define the hub, refers to:
     genomes.txt – definitions for each genome assembly on this hub
          newOrg1/ - directory of files for this specific genome assembly
               newOrg1.2bit – ‘2bit’ file constructed from your fasta sequence
               description.html – information about this assembly for users
               trackDb.txt – definitions for tracks on this genome assembly
               groups.txt – definitions for track groups on this assembly
               bigWig and bigBed files – data for tracks on this assembly
               external track hub data tracks can be displayed on this assembly
In [ ]:
# define hub.txt
"""hub PRRSV_Hub
shortLabel PRRSV
longLabel PRRSV
useOneFile on
email magmir71@gmail.com
descriptionUrl https://raw.githubusercontent.com/zavolanlab/primer_probe_design/main/track_descriptions/Track_hub_description.html

genome hg38
defaultPos chr3:108042402-108047903
organism Homo Sapiens
In [ ]:
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")

command = ''

bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 20
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html

"""
f.write(bigbed_composite)
In [33]:
subdirs['PRRSV_github_dir']
Out[33]:
'/scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/github/'
In [ ]:
 
In [ ]:
 
In [ ]:
 

download canadian WGS of PPRSV-type2 from GENBANK¶

In [19]:
paths_tsv = pd.read_csv(subdirs['PRRSV_materials_dir']+'WGS_PMID32817228/WGS_Canada.tsv',delimiter="\t",index_col=None,header=0)
paths_tsv['sample'] = paths_tsv['GenBank']+'.'+paths_tsv['sample_type']+'.'+paths_tsv['number_of_pooled_animals'].str.replace('.','')
paths_tsv['target_path'] = subdirs['PRRSV_materials_dir']+'WGS_PMID32817228/'+paths_tsv['sample']+'.fasta'
In [26]:
# try downloading everything to single fasta file

target_path = subdirs['PRRSV_materials_dir']+'WGS_PMID32817228/whole_genome_seqs.PRRSV_2.fasta'

command = ''
for index,row in paths_tsv.iterrows():
    command = command+'esearch -db nucleotide -query "'+row['GenBank']+'" | efetch -format fasta '+('>' if index==0 else '>>')+' '+target_path+'; '    
command
Out[26]:
'esearch -db nucleotide -query "MN865482" | efetch -format fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865566" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865567" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865483" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865484" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865485" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865486" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865487" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865488" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865568" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865569" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865489" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865490" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865491" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865492" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865493" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865494" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865495" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865496" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865570" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865571" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865497" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865498" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865499" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865500" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865501" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865502" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865503" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865504" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865505" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865506" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865507" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865508" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865509" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865510" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865511" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865512" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865513" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865514" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865515" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865516" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865517" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865518" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865519" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865520" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865521" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865522" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865523" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865524" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865525" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865526" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865527" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865528" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865529" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865530" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865531" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865532" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865533" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865534" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865535" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865536" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865537" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865538" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865539" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865540" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865541" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865542" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865543" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865544" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865545" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865546" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865547" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865548" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865549" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865550" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865551" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865552" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865553" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865554" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865555" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865556" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865557" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865558" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865559" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865560" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865561" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865562" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865572" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865573" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865563" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865564" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865565" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; '
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

human¶

In [366]:
organism = 'human'

num_of_samples = 813
organism_label = organism if organism!='celegans' else 'worm'

df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'

SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)

ourdir = os.path.dirname(file_paths[organism+'_output_PAS_bed_gz'])
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [367]:
tissues = ['prostate', 'skin', 'penis', 'intestine', 'heart', 'kidney', 'breast',
       'lung', 'uterus', 'nose', 'pancreas', 'trachea', 'bone', 'eye', 'liver','ureter', 'brain','bloodImmune']
In [368]:
###
# make bed file
###

mp_cols = [col for col in SCINPAS.columns if 'MP' in col]

v3_point = get_points_others(SCINPAS, 'v3')
results = get_points(SCINPAS, mp_cols)
results.append(v3_point)

results_df = pd.DataFrame(results, columns=['source', 'input_thr', 'num_pas', 'motif_percentage'])
results_df['num_pas'] = results_df['num_pas'].astype('int')
results_df['motif_percentage'] = results_df['motif_percentage'].astype('float')
results_df = results_df.sort_values('motif_percentage',ascending=True).reset_index(drop=True)
results_df['short_MP'] = np.round(results_df['motif_percentage'],0).astype('int')

mp_to_report_df = results_df.drop_duplicates('short_MP').reset_index(drop=True)
In [369]:
mp_to_report_df
Out[369]:
source input_thr num_pas motif_percentage short_MP
0 v3 full 18432135 20.056922 20
1 v3 20_MP 16605700 21.215884 21
2 v3 25_MP 13068497 23.315596 23
3 v3 30_MP 8775588 26.246629 26
4 v3 35_MP 5478736 29.275676 29
5 v3 40_MP 3348203 32.333494 32
6 v3 45_MP 2004409 35.653801 36
7 v3 50_MP 1296263 38.872513 39
8 v3 60_MP 604072 46.730688 47
9 v3 65_MP 426465 51.415239 51
10 v3 70_MP 312120 56.573433 57
11 v3 75_MP 236099 61.822371 62
12 v3 80_MP 173253 68.589866 69
13 v3 85_MP 121869 77.469250 77
14 v3 90_MP 88284 86.715600 87
15 v3 95_MP 64777 97.511462 98
In [370]:
SCINPAS['full'] = 1
SCINPAS['alt_score'] = SCINPAS[list(mp_to_report_df['input_thr'])].mul(list(mp_to_report_df['short_MP'])).max(1) # stringency

SCINPAS['alt_id'] = SCINPAS.apply(lambda x:':'.join((x['id'].split(':')[:3])),1)

# SCINPAS['%_of_samples_with_support'] = np.round(SCINPAS['supp']/num_of_samples*100,3)
SCINPAS['%_of_tissues_with_support'] = np.round(100*np.count_nonzero(SCINPAS[tissues],1)/len(tissues),1) # calculate percentage of tissues instead of samples

SCINPAS['num_of_protocols'] = 1
SCINPAS['avg_expression'] = np.round(SCINPAS[tissues].mean(1),6) # mean of means

tmp = SCINPAS[['segment_class']].drop_duplicates().reset_index(drop=True)
PAS_cat_dict = {'TE':'TE','I':'IN','E':'EX','A':'AL','D_I':'DI','U_I':'UI','N':'NA'}
tmp['PAS_cat'] = tmp['segment_class'].map(PAS_cat_dict)

if 'PAS_cat' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('PAS_cat',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='segment_class')

motif_cols = ['AAUAAA', 'AUUAAA','UAUAAA', 'AAGAAA', 'AGUAAA', 'AAUACA', 'AAUAUA', 'CAUAAA', 'AAUGAA','GAUAAA', 'ACUAAA', 'AAUAGA']

def fill_motif(x):
    res = ';'.join([elem for elem in motif_cols if x[elem]==1])
    if res=='':
        res = 'NaN'
    return res

tmp = SCINPAS[motif_cols].drop_duplicates().reset_index(drop=True)
tmp['motif'] = tmp.apply(lambda x: fill_motif(x), 1)

if 'motif' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('motif',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on=motif_cols)

tmp = SCINPAS[['seqid']].drop_duplicates().reset_index(drop=True)
tmp['chr'] = tmp['seqid'].str.replace('chr','').str.replace('X','23').str.replace('Y','24').astype('int')

if 'chr' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('chr',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='seqid')

SCINPAS = SCINPAS.sort_values(['chr','start','end','strand']).reset_index(drop=True)

SCINPAS[['seqid','start','end','alt_id','avg_expression',
         'strand','%_of_tissues_with_support',
         'num_of_protocols','alt_score','PAS_cat','motif']].to_csv(file_paths[organism+'_output_PAS_bed_gz'],
                                                                       sep=str('\t'),header=False,index=None,
                                                                       quoting=csv.QUOTE_NONE,compression='gzip')
In [371]:
Counter(SCINPAS['PAS_cat'])
Out[371]:
Counter({'DI': 3920458,
         'UI': 2454745,
         'TE': 564914,
         'IN': 10507671,
         'EX': 348266,
         'AL': 635988,
         'NA': 93})
In [373]:
###
# correct tsv file
###

input_gtf = pd.read_csv(file_paths[organism+'_annotation_file'], delimiter = '\t', header = None, skiprows=5)
input_gtf[[0,1,2,5,6,7]] = input_gtf[[0,1,2,5,6,7]].astype('category') # to decrease size of the DF
input_gtf[[3,4]] = input_gtf[[3,4]].astype('int')
genes = input_gtf.loc[input_gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('";',expand=True)[0]
genes['gene_name'] = genes[8].str.split('gene_name "',expand=True)[1].str.split('";',expand=True)[0]

SCINPAS_tsv = SCINPAS.copy()
SCINPAS_tsv = pd.merge(SCINPAS_tsv,genes[['gene_id','gene_name']].drop_duplicates().reset_index(drop=True).rename(columns = {'gene_id':'reassigned_g'}),how='left',on='reassigned_g')
SCINPAS_tsv['rep'] = SCINPAS_tsv.apply(lambda x:int(x['alt_id'].split(':')[1]),1)
SCINPAS_tsv = SCINPAS_tsv.rename(columns={'seqid':'chrom','start':'chromStart','end':'chromEnd','alt_id':'name','score':'score_original','alt_score':'stringency_level','strand':'strand',
                                          '%_of_tissues_with_support':'perc_tissues','num_of_protocols':'nr_prots','PAS_cat':'annotation','gene_id':'gene_id_original','reassigned_g':'gene_id','motif':'repSite_signals'})

SCINPAS_tsv = SCINPAS_tsv[['chrom','chromStart','chromEnd','name','avg_expression','strand','rep',
             'perc_tissues', 'nr_prots', 'annotation', 
             'gene_name', 'gene_id', 'repSite_signals','stringency_level']+tissues]

# rename tissues
tissue_rename_dict = {}
tissue_rename_dict['trachea']= 'tracheal epithelium'
tissue_rename_dict['nose']= 'nasal mucosa'
tissue_rename_dict['kidney']= 'kidney parenchyma'
tissue_rename_dict['intestine'] = 'intestine'
tissue_rename_dict['bone'] = 'intervertebral disc'
tissue_rename_dict['penis'] = 'corpus cavernosum'
SCINPAS_tsv = SCINPAS_tsv.rename(columns=tissue_rename_dict)

SCINPAS_tsv.to_csv(file_paths[organism+'_output_PAS_tsv_gz'],sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE,compression='gzip')
In [374]:
file_paths[organism+'_output_PAS_bed_gz'],file_paths[organism+'_output_PAS_tsv_gz']
Out[374]:
('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/atlas.clusters.3.0.GRCh38.GENCODE_42.bed.gz',
 '/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/atlas.clusters.3.0.GRCh38.GENCODE_42.tsv.gz')

mouse¶

In [177]:
organism = 'mouse'

num_of_samples = 188
organism_label = organism if organism!='celegans' else 'worm'

df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'

SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)

ourdir = os.path.dirname(file_paths[organism+'_output_PAS_bed_gz'])
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [178]:
tissues = ['Tongue', 'Bladder', 'Kidney', 'unknown', 'Spleen', 'Fat', 'Marrow',
       'Lung', 'Aorta', 'Heart', 'MammaryGland', 'LimbMuscle', 'Liver', 'Skin',
       'Pancreas', 'Thymus', 'LargeIntestine', 'Trachea']
In [179]:
###
# make bed file
###

mp_cols = [col for col in SCINPAS.columns if 'MP' in col]

v3_point = get_points_others(SCINPAS, 'v3')
results = get_points(SCINPAS, mp_cols)
results.append(v3_point)

results_df = pd.DataFrame(results, columns=['source', 'input_thr', 'num_pas', 'motif_percentage'])
results_df['num_pas'] = results_df['num_pas'].astype('int')
results_df['motif_percentage'] = results_df['motif_percentage'].astype('float')
results_df = results_df.sort_values('motif_percentage',ascending=True).reset_index(drop=True)
results_df['short_MP'] = np.round(results_df['motif_percentage'],0).astype('int')
In [180]:
results_df
Out[180]:
source input_thr num_pas motif_percentage short_MP
0 v3 full 1750661 31.755434 32
1 v3 10_MP 1750231 31.762379 32
2 v3 15_MP 1749436 31.774240 32
3 v3 20_MP 1747582 31.798565 32
4 v3 25_MP 1742464 31.857588 32
5 v3 30_MP 1720494 32.077066 32
6 v3 35_MP 1659463 32.592953 33
7 v3 40_MP 1509646 33.636826 34
8 v3 45_MP 1159635 35.511950 36
9 v3 50_MP 812441 38.166464 38
10 v3 60_MP 374502 46.313504 46
11 v3 65_MP 255694 51.276135 51
12 v3 70_MP 178719 56.678921 57
13 v3 75_MP 126435 62.408352 62
14 v3 80_MP 88566 68.975679 69
15 v3 85_MP 61528 76.241711 76
16 v3 90_MP 43620 84.094452 84
17 v3 95_MP 30230 93.691697 94
In [181]:
mp_to_report_df = results_df.drop_duplicates('short_MP').reset_index(drop=True)
SCINPAS['full'] = 1
SCINPAS['alt_score'] = SCINPAS[list(mp_to_report_df['input_thr'])].mul(list(mp_to_report_df['short_MP'])).max(1) # stringency

SCINPAS['alt_id'] = SCINPAS.apply(lambda x:':'.join((x['id'].split(':')[:3])),1)
# SCINPAS['%_of_samples_with_support'] = np.round(SCINPAS['supp']/num_of_samples*100,3)
SCINPAS['%_of_tissues_with_support'] = np.round(100*np.count_nonzero(SCINPAS[tissues],1)/len(tissues),1) # calculate percentage of tissues instead of samples
SCINPAS['num_of_protocols'] = 1

SCINPAS['avg_expression'] = np.round(SCINPAS[tissues].mean(1),6) # mean of means

tmp = SCINPAS[['segment_class']].drop_duplicates().reset_index(drop=True)
PAS_cat_dict = {'TE':'TE','I':'IN','E':'EX','A':'AL','D_I':'DI','U_I':'UI','N':'NA'}
tmp['PAS_cat'] = tmp['segment_class'].map(PAS_cat_dict)

if 'PAS_cat' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('PAS_cat',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='segment_class')

motif_cols = ['AAUAAA', 'AUUAAA','UAUAAA', 'AAGAAA', 'AGUAAA', 'AAUACA', 'AAUAUA', 'CAUAAA', 'AAUGAA','GAUAAA', 'ACUAAA', 'AAUAGA']

def fill_motif(x):
    res = ';'.join([elem for elem in motif_cols if x[elem]==1])
    if res=='':
        res = 'NaN'
    return res

tmp = SCINPAS[motif_cols].drop_duplicates().reset_index(drop=True)
tmp['motif'] = tmp.apply(lambda x: fill_motif(x), 1)

if 'motif' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('motif',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on=motif_cols)

tmp = SCINPAS[['seqid']].drop_duplicates().reset_index(drop=True)
tmp['chr'] = tmp['seqid'].str.replace('chr','').str.replace('X','20').str.replace('Y','21').astype('int')

if 'chr' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('chr',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='seqid')

SCINPAS = SCINPAS.sort_values(['chr','start','end','strand']).reset_index(drop=True)

SCINPAS[['seqid','start','end','alt_id','avg_expression',
         'strand','%_of_tissues_with_support',
         'num_of_protocols','alt_score','PAS_cat','motif']].to_csv(file_paths[organism+'_output_PAS_bed_gz'],
                                                                       sep=str('\t'),header=False,index=None,
                                                                       quoting=csv.QUOTE_NONE,compression='gzip')
In [182]:
len(SCINPAS)
Out[182]:
1750661
In [183]:
Counter(SCINPAS['PAS_cat'])
Out[183]:
Counter({'DI': 451686,
         'UI': 287219,
         'TE': 157405,
         'IN': 693688,
         'EX': 79934,
         'AL': 80615,
         'NA': 114})
In [184]:
###
# correct tsv file
###

input_gtf = pd.read_csv(file_paths[organism+'_annotation_file'], delimiter = '\t', header = None, skiprows=5)
input_gtf[[0,1,2,5,6,7]] = input_gtf[[0,1,2,5,6,7]].astype('category') # to decrease size of the DF
input_gtf[[3,4]] = input_gtf[[3,4]].astype('int')
genes = input_gtf.loc[input_gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('";',expand=True)[0]
genes['gene_name'] = genes[8].str.split('gene_name "',expand=True)[1].str.split('";',expand=True)[0]

SCINPAS_tsv = SCINPAS.copy()
SCINPAS_tsv = pd.merge(SCINPAS_tsv,genes[['gene_id','gene_name']].drop_duplicates().reset_index(drop=True).rename(columns = {'gene_id':'reassigned_g'}),how='left',on='reassigned_g')
SCINPAS_tsv['rep'] = SCINPAS_tsv.apply(lambda x:int(x['alt_id'].split(':')[1]),1)
SCINPAS_tsv = SCINPAS_tsv.rename(columns={'seqid':'chrom','start':'chromStart','end':'chromEnd','alt_id':'name','score':'score_original','alt_score':'stringency_level','strand':'strand',
                                          '%_of_tissues_with_support':'perc_tissues','num_of_protocols':'nr_prots','PAS_cat':'annotation','gene_id':'gene_id_original','reassigned_g':'gene_id','motif':'repSite_signals'})

SCINPAS_tsv[['chrom','chromStart','chromEnd','name','avg_expression','strand','rep',
             'perc_tissues', 'nr_prots', 'annotation', 
             'gene_name', 'gene_id', 'repSite_signals','stringency_level']+tissues].to_csv(file_paths[organism+'_output_PAS_tsv_gz'],
                                                                       sep=str('\t'),header=True,index=None,
                                                                       quoting=csv.QUOTE_NONE,compression='gzip')
In [185]:
file_paths[organism+'_output_PAS_bed_gz'],file_paths[organism+'_output_PAS_tsv_gz']
Out[185]:
('/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/atlas.clusters.3.0.GRCm38.GENCODE_M25.bed.gz',
 '/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/atlas.clusters.3.0.GRCm38.GENCODE_M25.tsv.gz')

worm¶

In [186]:
organism = 'celegans'

num_of_samples = 55
organism_label = organism if organism!='celegans' else 'worm'

df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'

SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)

ourdir = os.path.dirname(file_paths[organism+'_output_PAS_bed_gz'])
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [187]:
###
# make bed file
###

mp_cols = [col for col in SCINPAS.columns if 'MP' in col]

v3_point = get_points_others(SCINPAS, 'v3')
results = get_points(SCINPAS, mp_cols)
results.append(v3_point)

results_df = pd.DataFrame(results, columns=['source', 'input_thr', 'num_pas', 'motif_percentage'])
results_df['num_pas'] = results_df['num_pas'].astype('int')
results_df['motif_percentage'] = results_df['motif_percentage'].astype('float')
results_df = results_df.sort_values('motif_percentage',ascending=True).reset_index(drop=True)
results_df['short_MP'] = np.round(results_df['motif_percentage'],0).astype('int')

mp_to_report_df = results_df.drop_duplicates('short_MP').reset_index(drop=True)
In [ ]:
 
In [188]:
mp_to_report_df
Out[188]:
source input_thr num_pas motif_percentage short_MP
0 v3 full 66458 51.137561 51
1 v3 25_MP 65738 51.592686 52
2 v3 40_MP 62236 53.264991 53
3 v3 45_MP 58803 54.799925 55
4 v3 50_MP 54481 56.830822 57
5 v3 60_MP 44154 62.583231 63
6 v3 65_MP 39067 66.035273 66
7 v3 70_MP 34024 69.850694 70
8 v3 75_MP 28943 74.083544 74
9 v3 80_MP 22572 79.651781 80
10 v3 85_MP 17502 85.327391 85
11 v3 90_MP 13779 90.420205 90
12 v3 95_MP 7625 95.619672 96
In [189]:
SCINPAS['full'] = 1
SCINPAS['alt_score'] = SCINPAS[list(mp_to_report_df['input_thr'])].mul(list(mp_to_report_df['short_MP'])).max(1) # stringency

SCINPAS['alt_id'] = SCINPAS.apply(lambda x:':'.join((x['id'].split(':')[:3])),1)
SCINPAS['%_of_samples_with_support'] = np.round(SCINPAS['supp']/num_of_samples*100,3)
SCINPAS['num_of_protocols'] = 1

SCINPAS['avg_expression'] = np.round(SCINPAS['score'],6) # just average, since we don't have tissues

tmp = SCINPAS[['segment_class']].drop_duplicates().reset_index(drop=True)
PAS_cat_dict = {'TE':'TE','I':'IN','E':'EX','A':'AL','D_I':'DI','U_I':'UI','N':'NA'}
tmp['PAS_cat'] = tmp['segment_class'].map(PAS_cat_dict)

if 'PAS_cat' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('PAS_cat',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='segment_class')

motif_cols = ['AAUAAA', 'AUUAAA','UAUAAA', 'AAGAAA', 'AGUAAA', 'AAUACA', 'AAUAUA', 'CAUAAA', 'AAUGAA','GAUAAA', 'ACUAAA', 'AAUAGA']

def fill_motif(x):
    res = ';'.join([elem for elem in motif_cols if x[elem]==1])
    if res=='':
        res = 'NaN'
    return res

tmp = SCINPAS[motif_cols].drop_duplicates().reset_index(drop=True)
tmp['motif'] = tmp.apply(lambda x: fill_motif(x), 1)

if 'motif' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('motif',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on=motif_cols)

tmp = SCINPAS[['seqid']].drop_duplicates().reset_index(drop=True)
map_dict = {'I':1,'II':2,'III':3,'IV':'4','V':5,'X':6}
tmp['chr'] = tmp['seqid'].map(map_dict)
tmp['new_seqid'] = 'chr'+tmp['seqid'] # to comply with UCSC format

if 'chr' in list(SCINPAS.columns):
    SCINPAS = SCINPAS.drop('chr',1)

SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='seqid')

SCINPAS = SCINPAS.sort_values(['chr','start','end','strand']).reset_index(drop=True)

SCINPAS[['new_seqid','start','end','alt_id','avg_expression',
         'strand','%_of_samples_with_support',
         'num_of_protocols','alt_score','PAS_cat','motif']].to_csv(file_paths[organism+'_output_PAS_bed_gz'],
                                                                       sep=str('\t'),header=False,index=None,
                                                                       quoting=csv.QUOTE_NONE,compression='gzip')
In [190]:
len(SCINPAS)
Out[190]:
66458
In [191]:
Counter(SCINPAS['PAS_cat'])
Out[191]:
Counter({'DI': 15113,
         'TE': 26294,
         'IN': 9342,
         'EX': 11353,
         'UI': 3393,
         'AL': 963})
In [194]:
###
# correct tsv file
###

input_gtf = pd.read_csv(file_paths[organism+'_annotation_file'], delimiter = '\t', header = None, skiprows=5)
input_gtf[[0,1,2,5,6,7]] = input_gtf[[0,1,2,5,6,7]].astype('category') # to decrease size of the DF
input_gtf[[3,4]] = input_gtf[[3,4]].astype('int')
genes = input_gtf.loc[input_gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('";',expand=True)[0]
genes['gene_name'] = genes[8].str.split('gene_name "',expand=True)[1].str.split('";',expand=True)[0]

SCINPAS_tsv = SCINPAS.copy()
SCINPAS_tsv = pd.merge(SCINPAS_tsv,genes[['gene_id','gene_name']].drop_duplicates().reset_index(drop=True).rename(columns = {'gene_id':'reassigned_g'}),how='left',on='reassigned_g')
SCINPAS_tsv['rep'] = SCINPAS_tsv.apply(lambda x:int(x['alt_id'].split(':')[1]),1)
SCINPAS_tsv = SCINPAS_tsv.rename(columns={'new_seqid':'chrom','start':'chromStart','end':'chromEnd','alt_id':'name','score':'score_original','alt_score':'stringency_level','strand':'strand',
                                          '%_of_samples_with_support':'perc_samples','num_of_protocols':'nr_prots','PAS_cat':'annotation','gene_id':'gene_id_original','reassigned_g':'gene_id','motif':'repSite_signals'})

SCINPAS_tsv[['chrom','chromStart','chromEnd','name','avg_expression','strand','rep',
             'perc_samples', 'nr_prots', 'annotation', 
             'gene_name', 'gene_id', 'repSite_signals','stringency_level']].to_csv(file_paths[organism+'_output_PAS_tsv_gz'],
                                                                       sep=str('\t'),header=True,index=None,
                                                                       quoting=csv.QUOTE_NONE,compression='gzip')
In [193]:
file_paths[organism+'_output_PAS_bed_gz'],file_paths[organism+'_output_PAS_tsv_gz']
Out[193]:
('/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/polyAsite_Atlas_3/atlas.clusters.3.0.WBcel235.WormBase_WS293.bed.gz',
 '/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/polyAsite_Atlas_3/atlas.clusters.3.0.WBcel235.WormBase_WS293.tsv.gz')
In [ ]:
 
In [ ]:
 
In [ ]:
 

Human¶

In [375]:
cur_dir = '/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/'

organism = 'human'
organism_github_subfolder = 'GRCh38'
github_branch = 'dev'
chrom_sizes_file = file_paths[organism+'_chrom_sizes']

github_dir = subdirs[organism+'_github_dir']
out_dir = github_dir+'total/'
tmp_dir = cur_dir+'tmp/'
trackhub_chunks_dir = tmp_dir+organism+'/'
big_data_dir = subdirs['shared_project_dir']+(organism if organism!='celegans' else 'c_elegans')+'/polyAsite_Atlas_3/for_trackhub/'

out = subprocess.check_output('mkdir -p '+tmp_dir +' '+out_dir+' '+trackhub_chunks_dir+' '+big_data_dir, shell=True)

SCINPAS_tsv = pd.read_csv(file_paths[organism+'_output_PAS_tsv_gz'],delimiter="\t",index_col=None,header=0,compression='gzip')
In [4]:
# SCINPAS_tsv = SCINPAS_tsv.loc[SCINPAS_tsv['chrom']=='chr3'].reset_index(drop=True) # temporary filtration, for testing
In [377]:
len(SCINPAS_tsv)
Out[377]:
18432135
In [378]:
# renamed tissues!

tissues = ['brain', 'bloodImmune','eye', 'intervertebral disc', 'nasal mucosa', 'skin', 'corpus cavernosum',  'heart', 'breast',
       'lung', 'tracheal epithelium', 'pancreas','intestine','kidney parenchyma', 'liver', 'uterus', 'ureter','prostate']
In [24]:
# make bigbeds for annotation and bigwigs with average RPM, and tissue-specific RPM values

strands = {'+':'plus','-':'minus'}

command = ''
stringency_groups = {'20-21':[20, 21],'22-29':[23, 26, 29],'30-100':[32, 36, 39, 47, 51, 57, 62, 69, 77, 87, 98]}
for stringency_group in stringency_groups:
    l = stringency_groups[stringency_group]
    for strand in strands:
        SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['stringency_level'].isin(l))&(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
        
        ### bigbed - average
        SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','name','stringency_level','strand']].to_csv(out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
        command = command+'sort -k1,1 -k2,2n '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed'+' > '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
        command = command+'bedToBigBed -type=bed6 '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bb && '
        command = command+'rm '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
        
        ### bigwig - average
        strand_long_label = strands[strand]
        SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','avg_expression']].to_csv(tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
        command = command+'sort -k1,1 -k2,2n '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph > '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
        command = command+'bedGraphToBigWig '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bw && '
        command = command+'rm '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '

command
Out[24]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.20-21.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.20-21.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.20-21.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.20-21.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.22-29.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.22-29.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.22-29.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.22-29.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.30-100.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.30-100.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.30-100.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.30-100.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.sorted.bedgraph && '
In [382]:
### bigwig - tissue-specific

strands = {'+':'plus','-':'minus'}

test_out_dir = github_dir+'over_tissues/'
out = subprocess.check_output('mkdir -p '+test_out_dir, shell=True)

command = ''
for strand in strands:
    strand_long_label = strands[strand]
    # SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
    for tissue in tissues:
        outfile = test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bw'
        if not((os.path.isfile(outfile) and (os.stat(outfile).st_size > 0))):
        # SCINPAS_tsv_cur[['chrom','chromStart','chromEnd',tissue]].to_csv(test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
            command = command+'sort -k1,1 -k2,2n "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph" > "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph" && '
            command = command+'bedGraphToBigWig "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph" '+chrom_sizes_file+' "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bw" && '
            command = command+'rm "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph" "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph" && '
command
Out[382]:
'sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.sorted.bedgraph" && '
In [388]:
# create text chunks for trackhub configuration

max_average = str(int(SCINPAS_tsv['avg_expression'].max()))
max_tissue_level = str(int(SCINPAS_tsv[tissues].max(1).max()))
stringency_groups = {'20-21':[20, 21],'22-29':[23, 26, 29],'30-100':[32, 36, 39, 47, 51, 57, 62, 69, 77, 87, 98]}

strands = {'+':'plus','-':'minus'}
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")

command = ''

bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 20
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html

"""
f.write(bigbed_composite)

i = 0
for stringency_group in stringency_groups:
    for strand in strands:
        bigbed = """track PAS_catalogue_"""+str(stringency_group)+'_'+strand+""" 
parent PAS_catalogue on
shortLabel """+str(stringency_group)+"""% motif presence, """+strand+""" strand
longLabel """+('1. PAS - full catalogue' if i==0 else '')+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand+'.total.bb'+"""
        
"""
        f.write(bigbed)
        i=i+1

multiwig = """track PAS_average_RPM
visibility full
shortLabel PAS mean RPM
longLabel 2. PAS, average RPM across tissues
container multiWig
aggregate transparentOverlay
showSubtrackColorOnUi on
type bigWig 0 """+max_average+"""
viewLimits 0:"""+max_average+"""
autoScale on
maxHeightPixels 90:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html

"""
f.write(multiwig)

for stringency_group in stringency_groups:
    for strand in strands:
        strand_long_label = strands[strand]
        multiwig_subtrack = 'track PAS_'+str(stringency_group)+'_'+strand_long_label+'_strand'+"""
shortLabel RPM, """+str(stringency_group)+'%, '+strand+"""
longLabel PAS average RPM, for PAS with """+str(stringency_group)+"""% motif presence, on a ("""+strand+""") strand
parent PAS_average_RPM"""+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_average+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html

"""
        f.write(multiwig_subtrack)

i=3
for strand in strands:
    strand_long_label = strands[strand]
    multiwig = """track PAS_tissue_RPM_"""+strand_long_label+"""
visibility dense
shortLabel PAS tissue RPM ("""+strand+""") strand
longLabel """+str(i)+""". PAS tissue RPM ("""+strand+""") strand
container multiWig
aggregate none
showSubtrackColorOnUi on
type bigWig 0 """+max_tissue_level+"""
viewLimits 0:"""+max_tissue_level+"""
autoScale on
maxHeighPixels 120:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html

"""
    f.write(multiwig)
    i=i+1
    for tissue in tissues:
        subtrack = """track PAS_"""+tissue.replace(' ','_')+"""_RPM_"""+strand_long_label+"""_strand
shortLabel """+tissue+"""
longLabel PAS """+tissue+""" RPM ("""+strand+""") strand
parent PAS_tissue_RPM_"""+strand_long_label+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_tissue_level+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/over_tissues/"""+'PAS.'+tissue+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html

"""
        f.write(subtrack)
f.close()

Mouse¶

In [43]:
cur_dir = '/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/'

organism = 'mouse'
organism_github_subfolder = 'GRCm38'
github_branch = 'dev'
chrom_sizes_file = file_paths[organism+'_chrom_sizes']

github_dir = subdirs[organism+'_github_dir']
out_dir = github_dir+'total/'
tmp_dir = cur_dir+'tmp/'
trackhub_chunks_dir = tmp_dir+organism+'/'
big_data_dir = subdirs['shared_project_dir']+(organism if organism!='celegans' else 'c_elegans')+'/polyAsite_Atlas_3/for_trackhub/'

out = subprocess.check_output('mkdir -p '+tmp_dir +' '+out_dir+' '+trackhub_chunks_dir+' '+big_data_dir, shell=True)

SCINPAS_tsv = pd.read_csv(file_paths[organism+'_output_PAS_tsv_gz'],delimiter="\t",index_col=None,header=0,compression='gzip')
In [4]:
# SCINPAS_tsv = SCINPAS_tsv.loc[SCINPAS_tsv['chrom']=='chr3'].reset_index(drop=True) # temporary filtration, for testing
In [44]:
len(SCINPAS_tsv)
Out[44]:
1750661
In [46]:
tissues = ['Bladder', 'Liver', 'Kidney','Spleen', 'Pancreas','LargeIntestine','Lung', 'Trachea',
        'Fat','MammaryGland', 'Marrow', 'Aorta', 'Heart','Tongue', 
        'LimbMuscle', 'Skin', 'Thymus',
        'unknown']
In [47]:
# make bigbeds for annotation and bigwigs with average RPM, and tissue-specific RPM values

strands = {'+':'plus','-':'minus'}

command = ''
stringency_group = 'all'
for strand in strands:
    SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
    
    ### bigbed - average
    SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','name','stringency_level','strand']].to_csv(out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
    command = command+'sort -k1,1 -k2,2n '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed'+' > '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
    command = command+'bedToBigBed -type=bed6 '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bb && '
    command = command+'rm '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
    
    ### bigwig - average
    strand_long_label = strands[strand]
    SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','avg_expression']].to_csv(tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
    command = command+'sort -k1,1 -k2,2n '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph > '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
    command = command+'bedGraphToBigWig '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bw && '
    command = command+'rm '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '

command
Out[47]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && '
In [49]:
### bigwig - tissue-specific

strands = {'+':'plus','-':'minus'}

test_out_dir = github_dir+'over_tissues/'
out = subprocess.check_output('mkdir -p '+test_out_dir, shell=True)

command = ''
for strand in strands:
    strand_long_label = strands[strand]
    SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
    for tissue in tissues:
        SCINPAS_tsv_cur[['chrom','chromStart','chromEnd',tissue]].to_csv(test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
        command = command+'sort -k1,1 -k2,2n '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph > '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph && '
        command = command+'bedGraphToBigWig '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bw && '
        command = command+'rm '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph && '
command
Out[49]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.sorted.bedgraph && '
In [51]:
# create text chunks for trackhub configuration

max_average = str(int(SCINPAS_tsv['avg_expression'].max()))
max_tissue_level = str(int(SCINPAS_tsv[tissues].max(1).max()))

strands = {'+':'plus','-':'minus'}
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")

command = ''

stringency_group = 'all'

bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 30
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html

"""
f.write(bigbed_composite)

i = 0
for strand in strands:
    bigbed = """track PAS_catalogue_"""+strand+""" 
parent PAS_catalogue on
shortLabel """+strand+""" strand
longLabel """+('1. PAS - full catalogue' if i==0 else '')+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand+'.total.bb'+"""
    
"""
    f.write(bigbed)
    i=i+1

multiwig = """track PAS_average_RPM
visibility full
shortLabel PAS mean RPM
longLabel 2. PAS, average RPM across tissues
container multiWig
aggregate transparentOverlay
showSubtrackColorOnUi on
type bigWig 0 """+max_average+"""
viewLimits 0:"""+max_average+"""
autoScale on
maxHeightPixels 90:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html

"""
f.write(multiwig)

for strand in strands:
    strand_long_label = strands[strand]
    multiwig_subtrack = 'track PAS_'+strand_long_label+'_strand'+"""
shortLabel RPM, """+strand+"""
longLabel PAS average RPM, for PAS on a ("""+strand+""") strand
parent PAS_average_RPM"""+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_average+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html

"""
    f.write(multiwig_subtrack)

i=3
for strand in strands:
    strand_long_label = strands[strand]
    multiwig = """track PAS_tissue_RPM_"""+strand_long_label+"""
visibility dense
shortLabel PAS tissue RPM ("""+strand+""") strand
longLabel """+str(i)+""". PAS tissue RPM ("""+strand+""") strand
container multiWig
aggregate none
showSubtrackColorOnUi on
type bigWig 0 """+max_tissue_level+"""
viewLimits 0:"""+max_tissue_level+"""
autoScale on
maxHeighPixels 120:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html

"""
    f.write(multiwig)
    i=i+1
    for tissue in tissues:
        subtrack = """track PAS_"""+tissue+"""_RPM_"""+strand_long_label+"""_strand
shortLabel """+tissue+"""
longLabel PAS """+tissue+""" RPM ("""+strand+""") strand
parent PAS_tissue_RPM_"""+strand_long_label+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_tissue_level+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/over_tissues/"""+'PAS.'+tissue+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html

"""
        f.write(subtrack)
f.close()

Worm¶

In [55]:
cur_dir = '/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/'

organism = 'celegans'
organism_github_subfolder = 'WBcel235'
github_branch = 'dev'
chrom_sizes_file = file_paths[organism+'_chrom_sizes']

github_dir = subdirs[organism+'_github_dir']
out_dir = github_dir+'total/'
tmp_dir = cur_dir+'tmp/'
trackhub_chunks_dir = tmp_dir+organism+'/'
big_data_dir = subdirs['shared_project_dir']+(organism if organism!='celegans' else 'c_elegans')+'/polyAsite_Atlas_3/for_trackhub/'

out = subprocess.check_output('mkdir -p '+tmp_dir +' '+out_dir+' '+trackhub_chunks_dir+' '+big_data_dir, shell=True)

SCINPAS_tsv = pd.read_csv(file_paths[organism+'_output_PAS_tsv_gz'],delimiter="\t",index_col=None,header=0,compression='gzip')
In [4]:
# SCINPAS_tsv = SCINPAS_tsv.loc[SCINPAS_tsv['chrom']=='chr3'].reset_index(drop=True) # temporary filtration, for testing
In [56]:
len(SCINPAS_tsv)
Out[56]:
66458
In [59]:
# make bigbeds for annotation and bigwigs with average RPM, and tissue-specific RPM values

strands = {'+':'plus','-':'minus'}

command = ''
stringency_group = 'all'
for strand in strands:
    SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
    
    ### bigbed - average
    SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','name','stringency_level','strand']].to_csv(out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
    command = command+'sort -k1,1 -k2,2n '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed'+' > '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
    command = command+'bedToBigBed -type=bed6 '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bb && '
    command = command+'rm '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
    
    ### bigwig - average
    strand_long_label = strands[strand]
    SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','avg_expression']].to_csv(tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
    command = command+'sort -k1,1 -k2,2n '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph > '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
    command = command+'bedGraphToBigWig '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bw && '
    command = command+'rm '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '

command
Out[59]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && '
In [61]:
# create text chunks for trackhub configuration

max_average = str(int(SCINPAS_tsv['avg_expression'].max()))

strands = {'+':'plus','-':'minus'}
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")

command = ''

stringency_group = 'all'

bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 30
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html

"""
f.write(bigbed_composite)

i = 0
for strand in strands:
    bigbed = """track PAS_catalogue_"""+strand+""" 
parent PAS_catalogue on
shortLabel """+strand+""" strand
longLabel """+('1. PAS - full catalogue' if i==0 else '')+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand+'.total.bb'+"""
    
"""
    f.write(bigbed)
    i=i+1

multiwig = """track PAS_average_RPM
visibility full
shortLabel PAS mean RPM
longLabel 2. PAS, average RPM
container multiWig
aggregate transparentOverlay
showSubtrackColorOnUi on
type bigWig 0 """+max_average+"""
viewLimits 0:"""+max_average+"""
autoScale on
maxHeightPixels 90:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html

"""
f.write(multiwig)

for strand in strands:
    strand_long_label = strands[strand]
    multiwig_subtrack = 'track PAS_'+strand_long_label+'_strand'+"""
shortLabel RPM, """+strand+"""
longLabel PAS average RPM, for PAS on a ("""+strand+""") strand
parent PAS_average_RPM"""+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_average+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html

"""
    f.write(multiwig_subtrack)

f.close()

Supplementary figure about overlaps¶

In [285]:
pas = pd.read_csv(subdirs['temp_dir']+'v3.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)
v2 = pd.read_csv(subdirs['temp_dir']+'v2.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)
deep = pd.read_csv(subdirs['temp_dir']+'DL.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)

mp_cols = [col for col in pas.columns if 'MP' in col]
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[285], line 3
      1 pas = pd.read_csv(subdirs['temp_dir']+'v3.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)
      2 v2 = pd.read_csv(subdirs['temp_dir']+'v2.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)
----> 3 deep = pd.read_csv(subdirs['temp_dir']+'DL.human.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)

File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    946     defaults={"delimiter": ","},
    947 )
    948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)

File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
    602 _validate_names(kwds.get("names", None))
    604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
    607 if chunksize or iterator:
    608     return parser

File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
   1439     self.options["has_index_names"] = kwds["has_index_names"]
   1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)

File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
   1733     if "b" not in mode:
   1734         mode += "b"
-> 1735 self.handles = get_handle(
   1736     f,
   1737     mode,
   1738     encoding=self.options.get("encoding", None),
   1739     compression=self.options.get("compression", None),
   1740     memory_map=self.options.get("memory_map", False),
   1741     is_text=is_text,
   1742     errors=self.options.get("encoding_errors", "strict"),
   1743     storage_options=self.options.get("storage_options", None),
   1744 )
   1745 assert self.handles is not None
   1746 f = self.handles.handle

File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/common.py:856, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    851 elif isinstance(handle, str):
    852     # Check whether the filename is to be opened in binary mode.
    853     # Binary mode does not support 'encoding' and 'newline'.
    854     if ioargs.encoding and "b" not in ioargs.mode:
    855         # Encoding
--> 856         handle = open(
    857             handle,
    858             ioargs.mode,
    859             encoding=ioargs.encoding,
    860             errors=errors,
    861             newline="",
    862         )
    863     else:
    864         # Binary mode
    865         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: '/scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/DL.human.human.with_segment_class.tsv'

Overlapping of v3 with v2¶

In [292]:
df1 = pas[['seqid','start','end','strand','segment_class']+mp_cols].copy()
df2 = v2[['seqid','start','end','strand','segment_class']].copy()

df1['id_short'] = df1.index
df1['score_bed'] = 0
df1[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

df2['id_short'] = df2.index
df2['score_bed'] = 0
df2[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df2.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

command = 'bedtools sort -i '+subdirs['temp_dir']+'df1.non_sorted.bed > '+subdirs['temp_dir']+'df1.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df2.non_sorted.bed > '+subdirs['temp_dir']+'df2.sorted.bed'
out = subprocess.check_output(command, shell=True)
In [ ]:
###
# v2 out of v3
###
In [328]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']

df_of_label = 'v2.0'
df_in_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']
In [293]:
command = 'bedtools closest -d -s -a '+subdirs['temp_dir']+'df1.sorted.bed -b '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df1_df2.intersection.bed'
command
Out[293]:
'bedtools closest -d -s -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1_df2.intersection.bed'
In [325]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df1_df2.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in B with one in A.
df_intersected.columns = ['id_short','id_B','dist']

df_intersected = pd.merge(df1[['id_short','segment_class']+mp_cols_to_analyze],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [329]:
len(df_intersected),len(df1)
Out[329]:
(18432135, 18432135)
In [330]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']

sns.set(font_scale = 1)
sns.set_style("white")

fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))

j=0
for big_cat_val in big_cat_vals:
    df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
    i=0
    for mp_level in mp_cols_to_analyze:
        df_sel = df_intersected_cat.loc[df_intersected_cat[mp_level]==1].reset_index(drop=True)
        ax = sns.ecdfplot(ax=axes[j],data = df_sel,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
        i=i+1
    ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
    if j==2:
        ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
    ax.tick_params(bottom=True,left=False)
    if j==0:
        ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
        ax.tick_params(bottom=True,left=True)
    ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
    # set tick points
    l = [0,1,2,3,4,5]
    ax.set_xticks(l)
    j=j+1
ax.legend(title="Motif presence in "+df_in_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1)  # Set legend with no title

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [265]:
###
# v3 out of v2
###
In [314]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']

for mp_level in mp_cols_to_analyze:
    df1_sel = df1.loc[df1[mp_level]==1].reset_index(drop=True)
    df1_sel[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
    command = 'bedtools sort -i '+subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed > '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed'
    out = subprocess.check_output(command, shell=True)

    command = 'bedtools closest -d -s -b '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed -a '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed'
    out = subprocess.check_output(command, shell=True)
    print(mp_level+' done')
In [332]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']

df_in_label = 'v2.0'
df_of_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']

mp_level = '10_MP'
In [333]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']

sns.set(font_scale = 1)
sns.set_style("white")

fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))

i=0
for mp_level in mp_cols_to_analyze:
    df_intersected = pd.read_csv(subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed', delimiter = '\t', header = None)
    df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in A with one in B.
    df_intersected.columns = ['id_B','id_short','dist']
    
    df_intersected = pd.merge(df2[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
    df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)

    j=0
    for big_cat_val in big_cat_vals:
        df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
        ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
        
        ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
        if j==2:
            ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
        ax.tick_params(bottom=True,left=False)
        if j==0:
            ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
            ax.tick_params(bottom=True,left=True)
        ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
        # set tick points
        l = [0,1,2,3,4,5]
        ax.set_xticks(l)
        j=j+1
    i=i+1

ax.legend(title="Motif presence in "+df_of_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1)  # Set legend with no title

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image

Overlapping of v3 with DL¶

In [335]:
df1 = pas[['seqid','start','end','strand','segment_class']+mp_cols].copy()
df2 = deep[['seqid','start','end','strand','segment_class']].copy()

df1['id_short'] = df1.index
df1['score_bed'] = 0
df1[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

df2['id_short'] = df2.index
df2['score_bed'] = 0
df2[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df2.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

command = 'bedtools sort -i '+subdirs['temp_dir']+'df1.non_sorted.bed > '+subdirs['temp_dir']+'df1.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df2.non_sorted.bed > '+subdirs['temp_dir']+'df2.sorted.bed'
out = subprocess.check_output(command, shell=True)
In [ ]:
###
# v2 out of v3
###
In [336]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']

df_of_label = 'DL'
df_in_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']
In [337]:
command = 'bedtools closest -d -s -a '+subdirs['temp_dir']+'df1.sorted.bed -b '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df1_df2.intersection.bed'
command
Out[337]:
'bedtools closest -d -s -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1_df2.intersection.bed'
In [338]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df1_df2.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in B with one in A.
df_intersected.columns = ['id_short','id_B','dist']

df_intersected = pd.merge(df1[['id_short','segment_class']+mp_cols_to_analyze],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [339]:
len(df_intersected),len(df1)
Out[339]:
(18432135, 18432135)
In [341]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']

sns.set(font_scale = 1)
sns.set_style("white")

fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))

j=0
for big_cat_val in big_cat_vals:
    df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
    i=0
    for mp_level in mp_cols_to_analyze:
        df_sel = df_intersected_cat.loc[df_intersected_cat[mp_level]==1].reset_index(drop=True)
        ax = sns.ecdfplot(ax=axes[j],data = df_sel,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
        i=i+1
    ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
    if j==2:
        ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
    ax.tick_params(bottom=True,left=False)
    if j==0:
        ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
        ax.tick_params(bottom=True,left=True)
    ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
    # set tick points
    l = [0,1,2,3,4,5]
    ax.set_xticks(l)
    j=j+1
ax.legend(title="Motif presence in "+df_in_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1)  # Set legend with no title

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [342]:
###
# v3 out of v2
###
In [343]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']

for mp_level in mp_cols_to_analyze:
    df1_sel = df1.loc[df1[mp_level]==1].reset_index(drop=True)
    df1_sel[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
    command = 'bedtools sort -i '+subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed > '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed'
    out = subprocess.check_output(command, shell=True)

    command = 'bedtools closest -d -s -b '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed -a '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed'
    out = subprocess.check_output(command, shell=True)
    print(mp_level+' done')
10_MP done
75_MP done
90_MP done
In [344]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']

df_in_label = 'DL'
df_of_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']

mp_level = '10_MP'
In [346]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']

sns.set(font_scale = 1)
sns.set_style("white")

fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))

i=0
for mp_level in mp_cols_to_analyze:
    df_intersected = pd.read_csv(subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed', delimiter = '\t', header = None)
    df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in A with one in B.
    df_intersected.columns = ['id_B','id_short','dist']
    
    df_intersected = pd.merge(df2[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
    df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)

    j=0
    for big_cat_val in big_cat_vals:
        df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
        ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
        
        ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
        if j==2:
            ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
        ax.tick_params(bottom=True,left=False)
        if j==0:
            ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
            ax.tick_params(bottom=True,left=True)
        ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
        # set tick points
        l = [0,1,2,3,4,5]
        ax.set_xticks(l)
        j=j+1
    i=i+1

ax.legend(title="Motif presence in "+df_of_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1)  # Set legend with no title

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image

Overlapping of v2 with DL¶

In [347]:
df1 = v2[['seqid','start','end','strand','segment_class']].copy()
df2 = deep[['seqid','start','end','strand','segment_class']].copy()

df1['id_short'] = df1.index
df1['score_bed'] = 0
df1[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

df2['id_short'] = df2.index
df2['score_bed'] = 0
df2[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df2.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

command = 'bedtools sort -i '+subdirs['temp_dir']+'df1.non_sorted.bed > '+subdirs['temp_dir']+'df1.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df2.non_sorted.bed > '+subdirs['temp_dir']+'df2.sorted.bed'
out = subprocess.check_output(command, shell=True)
In [ ]:
###
# v2 out of v3
###
In [349]:
color = 'orange'

df_of_label = 'DL'
df_in_label = 'v2.0'
In [348]:
command = 'bedtools closest -d -s -a '+subdirs['temp_dir']+'df1.sorted.bed -b '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df1_df2.intersection.bed'
command
Out[348]:
'bedtools closest -d -s -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1_df2.intersection.bed'
In [350]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df1_df2.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in B with one in A.
df_intersected.columns = ['id_short','id_B','dist']

df_intersected = pd.merge(df1[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [351]:
len(df_intersected),len(df1)
Out[351]:
(568608, 568608)
In [355]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']

sns.set(font_scale = 1)
sns.set_style("white")

fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))

j=0
for big_cat_val in big_cat_vals:
    df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
    ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=color)
    ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
    if j==2:
        ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
    ax.tick_params(bottom=True,left=False)
    if j==0:
        ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
        ax.tick_params(bottom=True,left=True)
    ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
    # set tick points
    l = [0,1,2,3,4,5]
    ax.set_xticks(l)
    j=j+1

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [356]:
###
# v3 out of v2
###
In [358]:
command = 'bedtools closest -d -s -b '+subdirs['temp_dir']+'df1.sorted.bed -a '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df2_df1.intersection.bed'
command
Out[358]:
'bedtools closest -d -s -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2_df1.intersection.bed'
In [359]:
color = 'green'

df_in_label = 'DL'
df_of_label = 'v2.0'
In [360]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df2_df1.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in A with one in B.
df_intersected.columns = ['id_B','id_short','dist']

df_intersected = pd.merge(df2[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [362]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']

sns.set(font_scale = 1)
sns.set_style("white")

fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))

j=0
for big_cat_val in big_cat_vals:
    df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
    ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=color)
    
    ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
    if j==2:
        ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
    ax.tick_params(bottom=True,left=False)
    if j==0:
        ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
        ax.tick_params(bottom=True,left=True)
    ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
    # set tick points
    l = [0,1,2,3,4,5]
    ax.set_xticks(l)
    j=j+1
i=i+1

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image

Figure 2¶

In [7]:
df = pd.read_csv("/scicore/home/zavolan/moon0000/tissue_specific_pas/75motif_presence_filtered_additional_tissue_specific_ward_30_numpas_3_10.bed",delimiter="\t",index_col=None,header=0)

tissues = ['prostate', 'skin', 'penis', 'intestine', 'heart', 'kidney', 'breast',
       'lung', 'uterus', 'nose', 'pancreas', 'trachea', 'bone', 'eye', 'liver','ureter', 'brain','bloodImmune']

usage_cols = ['usage_'+elem for elem in tissues]

quantile_columns = ["q5", "q10", "q15", "q20", "q80", "q90", "q95", "q99"]
miv_val = df.loc[df['mean_denom']>0]['mean_denom'].min()
df['mean_denom_log2'] = np.log2(df['mean_denom']+miv_val)

###
# calculate number of PAS per gene
###

df['num_PASs'] = 1
gr = df.groupby(['reassigned_g']).agg({'num_PASs':np.sum}).reset_index()
df = pd.merge(df.drop('num_PASs',1),gr,how='left',on='reassigned_g')
In [8]:
###
# calculate number of tissues with no gene expression
###

calc_dict = {}
for elem in tissues:
    calc_dict[elem] = np.sum

gr = df.groupby(['reassigned_g','num_PASs']).agg(calc_dict).reset_index()

def low_q(x):
    return np.quantile(x,0.01)

calc_dict = {}
for elem in tissues:
    calc_dict[elem] = low_q

gr1 = gr.groupby('num_PASs').agg(calc_dict).reset_index()
gr1['min_val'] = gr1[tissues].min(1)

gr = pd.merge(gr,gr1[['num_PASs','min_val']],how='left',on='num_PASs')
gr['num_no_expr'] = gr[tissues].le(list(gr['min_val']),axis=0).sum(1)

a = (~gr[tissues].le(list(gr['min_val']),axis=0))
a[~a] = np.nan
gr = pd.concat([gr[['reassigned_g','num_no_expr']],a],axis=1)

rename_dict = {}
for elem in tissues:
    rename_dict[elem] = elem+'_expr'
gr = gr.rename(columns=rename_dict)

if 'num_no_expr' in df.columns:
    df = df.drop('num_no_expr',1)

df = pd.merge(df,gr,how='left',on='reassigned_g')
expr_status_cols = [elem+'_expr' for elem in tissues]
df['std_adj'] = df[usage_cols].mul(df[expr_status_cols].values,axis=0).std(1)
In [9]:
df['num_no_expr'].quantile(0.9)
Out[9]:
5.0
In [10]:
# select num of PAS
num_PASs_selection = (3,20)
scaling_value = 'q95'
scaling_value_ubiq = 'q20'

# df_filtered = df[df['mean_denom'] > threshold]
df_filtered = df.loc[(df['mean_denom_log2']>-1)&(df['num_no_expr']<1)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
df_filtered['avg_usage'] = df_filtered[usage_cols].mean(1) # mean of means

df_filtered['ts'] = (df_filtered['std']>df_filtered[scaling_value]).astype('int')
df_filtered['ubiq'] = ((df_filtered['avg_usage']>0.5)&(df_filtered['std']<df_filtered[scaling_value_ubiq])).astype('int')
df_filtered['low'] = ((df_filtered['avg_usage']<0.5)&(df_filtered['std']<df_filtered[scaling_value_ubiq])).astype('int')
df_filtered['cat'] = df_filtered['ts']*3+df_filtered['ubiq']*2+df_filtered['low']*1

# Set figure size as requested
sns.set(font_scale = 0.5)
sns.set_style("white")

fig, ax = plt.subplots(1, 1, sharey=False, sharex=False, figsize = (2.8, 1))

x_feature,y_feature = 'mean_denom_log2','std'
xlabel, ylabel = 'tissue-avg host gene expression, $log_2$ RPM','st.dev. of PAS usage\nover tissues'
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==0], color = 'royalblue', alpha = 0.03, s = 0.5)
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==3], color = 'green', alpha = 0.06, s = 0.5)
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==1], color = 'magenta', alpha = 0.06, s = 2.5)
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==2], color = 'orange', alpha = 1, s = 2.5)


quantiles_to_show = ["q20", "q95"]
palette = ['brown','green']
scaling_value = 'q95'

tmp = df[quantiles_to_show+[x_feature]].drop_duplicates().reset_index(drop=True)

i = 0
for quantile_ in quantiles_to_show:
    ax = sns.lineplot(x = x_feature, y = quantile_, data = tmp, label = quantile_, linewidth = 1, color = palette[i])
    i=i+1
ax.set(xlim = (-1,10),ylim=(-0.03,0.5),xlabel=xlabel,ylabel=ylabel)
ax.tick_params(left=True, bottom=True)
ax.legend(title="quantile of st. dev.",bbox_to_anchor=(0.2, 1),loc=3,borderaxespad=0.0,ncol=2)  # Set legend with no title

# out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
# fig.savefig(subdirs['figures_dir']+'over_tissues/2A.png',bbox_inches='tight',dpi=600)
# fig.savefig(subdirs['figures_dir']+'over_tissues/2A.pdf',bbox_inches='tight',dpi=600)
Out[10]:
<matplotlib.legend.Legend at 0x14e175289c90>
No description has been provided for this image
In [12]:
len(df_filtered['reassigned_g'].unique())
Out[12]:
11888
In [78]:
Counter(df_filtered['cat'])
Out[78]:
Counter({0: 82781, 3: 4465, 1: 20358, 2: 163})
In [218]:
sns.set(font_scale = 0.5)
sns.set_style("white")

fig, axes = plt.subplots(1, 1, sharey=False, sharex=True, figsize = (2.8, 1))

bins_non_ts = list(pd.Series(range(0,105,5))/100)
bins_ts = list(pd.Series(range(0,105,5))/100)
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==0],x='avg_usage',color = 'royalblue')
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==3],x='avg_usage',color = 'green', alpha = 1, )
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==2],x='avg_usage',color = 'orange', alpha = 1,)
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==1],x='avg_usage',color = 'magenta', alpha = 1,)

ax.set(ylabel='CDF',xlabel='average PAS usage over tissues',ylim=(-0.01,1.01))
ax.tick_params(left=True, bottom=True)

# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==0],y='std',x='avg_usage',color = 'royalblue', alpha = 0.03, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==3],y='std',x='avg_usage',color = 'green', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==2],y='std',x='avg_usage',color = 'orange', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==1],y='std',x='avg_usage',color = 'magenta', alpha = 0.2, s = 0.5)
# ax.tick_params(left=True, bottom=True)
# ax.set(xlabel='average PAS usage over tissues',ylabel='st.dev. of PAS usage\nover tissues')

# out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
# fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.png',bbox_inches='tight',dpi=600)
# fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [221]:
sns.set(font_scale = 0.5)
sns.set_style("white")

fig, axes = plt.subplots(1, 1, sharey=False, sharex=True, figsize = (2.8, 1))

bins_non_ts = list(pd.Series(range(0,105,5))/100)
bins_ts = list(pd.Series(range(0,105,5))/100)
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==1],x='avg_usage',color = 'magenta', alpha = 1, stat='density',bins=bins_non_ts,element='step')
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==0],x='avg_usage',color = 'royalblue', alpha = 1, stat='density',bins=bins_non_ts,element='step')
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==3],x='avg_usage',color = 'green', alpha = 1, stat='density',bins=bins_ts,element='step')
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==2],x='avg_usage',color = 'orange', alpha = 1, stat='density',bins=bins_non_ts,element='step')


ax.set(ylabel='Density',xlabel='average PAS usage over tissues')
ax.tick_params(left=True, bottom=True)

# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==0],y='std',x='avg_usage',color = 'royalblue', alpha = 0.03, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==3],y='std',x='avg_usage',color = 'green', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==2],y='std',x='avg_usage',color = 'orange', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==1],y='std',x='avg_usage',color = 'magenta', alpha = 0.2, s = 0.5)
# ax.tick_params(left=True, bottom=True)
# ax.set(xlabel='average PAS usage over tissues',ylabel='st.dev. of PAS usage\nover tissues')

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [27]:
df_filtered['t'] = 1
df_filtered['std_1'] = np.round(df_filtered['std'],2)
gr = df_filtered.groupby('std_1').agg({'t':np.sum}).reset_index().sort_values('t',ascending=False)

sns.histplot(df_filtered.loc[df_filtered['std_1'].isin([0.23,0.24])]['num_no_expr'],color='red',stat='density',bins=range(0,30))
sns.histplot(df_filtered.loc[~df_filtered['std_1'].isin([0.23,0.24])]['num_no_expr'],color='blue',stat='density',bins=range(0,30))
Out[27]:
<AxesSubplot: xlabel='num_no_expr', ylabel='Density'>
No description has been provided for this image
In [558]:
# experimental, does not work that good
# scaling_value = 'q90'

# df_scores = pd.concat([df_filtered[['id','reassigned_g','std',scaling_value]],(df_filtered[usage_cols].sub(df_filtered[usage_cols].mean(1),axis=0)).div(df_filtered[scaling_value],axis=0)],axis=1)
In [487]:
num_PASs_selection = (3,10)
scaling_value = 'q95'
data = df.loc[(df['std']>df[scaling_value])&(df['mean_denom_log2']>-1.5)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
len(data)
Out[487]:
6575
In [488]:
# smth is wrong with cluster assignment
data = df.loc[df['Cluster']!='not_considered'].reset_index(drop=True)
len(data),data['mean_denom_log2'].min()
Out[488]:
(1644, -1.4603511946072305)
In [2]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
In [4]:
import scipy.cluster
In [ ]:
scipy.
In [195]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from matplotlib.colors import Normalize, LinearSegmentedColormap

scaling_value = 'q95'
method = 'ward'
num_PASs_selection = (3,20)
n_clusters = 30

data = df.loc[(df['std']>df[scaling_value])&(df['mean_denom_log2']>(-1))&(df['num_no_expr']<1)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
# data = df.loc[(df['std']>df[scaling_value])&(df['mean_denom_log2']>-1)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
# data = df.loc[df['Cluster']!='not_considered'].reset_index(drop=True) # there, filtering by the number of PAS is done after selection by quantile and mean_denom_log2 
ids = data['id']
data = data[usage_cols]

# Standardize the data
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(data)
scaled_data = data.values

# Perform hierarchical clustering
linkage_matrix = linkage(scaled_data, method = method)

# Generate dendrogram to capture leaf order
dendro = dendrogram(linkage_matrix, no_plot = True)
ordered_leaves = dendro['leaves']

# Assign cluster labels based on the number of clusters
cluster_labels = fcluster(linkage_matrix, t = n_clusters, criterion = 'maxclust')
data['Cluster'] = cluster_labels

# Reorder the data according to the dendrogram leaves
data['DendroOrder'] = ordered_leaves

# Sort by Cluster and Dendrogram, and sort 'id' column in the same order
data_sorted = data.sort_values(by = ['Cluster', 'DendroOrder'])
ids_sorted = ids.iloc[data_sorted.index] # Sort 'id' alongside data

# Create a color palette for clusters
unique_clusters = np.unique(cluster_labels)
palette = sns.color_palette('hls', len(unique_clusters))

# Map clusters to colors
cluster_color_map = {cluster: palette[i] for i, cluster in enumerate(unique_clusters)}
row_colors = data_sorted['Cluster'].map(cluster_color_map)

# track row positions
data_sorted['row_pos'] = list(range(0,len(data_sorted)))

# Drop 'Cluster' and 'DendroOrder' for the heatmap, but keep the sorted 'id'
data_transformed = data_sorted.drop(columns = ['Cluster', 'DendroOrder','row_pos'])

# add id so that we can then retrieve cluster for PAS, save to data_clustered dataframe
data_sorted['id'] = list(ids_sorted)
data_clustered = data_sorted.copy().reset_index(drop=True)

# Custom colormap for the heatmap
cmap = LinearSegmentedColormap.from_list("custom_cmap", ["white", "blue"])
norm = Normalize(vmin=data_transformed.min().min(), vmax=data_transformed.max().max())

data_transformed.columns = [col.replace('usage_', '') for col in data_transformed.columns]

# rename tissues
tissue_rename_dict = {}
tissue_rename_dict['trachea']= 'tracheal epithelium'
tissue_rename_dict['nose']= 'nasal mucosa'
tissue_rename_dict['kidney']= 'kidney parenchyma'
tissue_rename_dict['intestine'] = 'intestine'
tissue_rename_dict['bone'] = 'intervertebral disc'
tissue_rename_dict['penis'] = 'corpus cavernosum'

data_transformed = data_transformed.rename(columns = tissue_rename_dict)
In [197]:
sns.set(font_scale = 0.5)
sns.set_style("white")
# Generate the clustermap
g = sns.clustermap(data_transformed, method=method, row_cluster=False, col_cluster=True, vmin=0,vmax=1,
                   row_colors=list(row_colors), cmap=cmap, norm=norm, figsize=(5.5, 5.5), cbar_kws = {'label':'PAS usage'}, cbar_pos=(0.95, 0.5, 0.02, 0.18)) # Adjust the 'shrink' value to make the color bar narrower
# Adjust x-tick labels
x_labels = g.ax_heatmap.get_xticklabels()

# Reapply the customized labels
g.ax_heatmap.set_xticklabels(labels = x_labels, rotation=60, ha='right',va='top',rotation_mode='anchor')
g.ax_heatmap.tick_params(right=False, bottom=True,width=0.5)
g.ax_heatmap.set_yticklabels(labels = [])
g.ax_heatmap.text(-1.2,int(len(data_transformed)/2),'cluster',rotation=90)

g.ax_cbar.tick_params(width=0.5)

gr = data_sorted.groupby('Cluster').agg({'row_pos':max}).reset_index()
for index,row in gr.head(len(gr)-1).iterrows():
    g.ax_heatmap.text(-0.65,row['row_pos'],'----',color='black',va='center')
    # g.ax_heatmap.text(-1.65,row['row_pos'],str(row['Cluster']),color='black',va='center')

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
g.savefig(subdirs['figures_dir']+'over_tissues/2B.png',bbox_inches='tight',dpi=600)
g.savefig(subdirs['figures_dir']+'over_tissues/2B.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [28]:
# we need this for the correct segment class
organism = 'human'

organism_label = organism if organism!='celegans' else 'worm'

df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'

SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)
In [80]:
# tissue-specific vs non-tissue-specific
num_PASs_selection = (3,20)
data = df.loc[(df['mean_denom_log2']>(-1))&(df['num_no_expr']==0)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)

data = pd.merge(data,SCINPAS[['id','segment_class']],how='left',on='id')
data = pd.merge(data.drop('Cluster',1),data_clustered[['id','Cluster']],how='left',on='id')
data['Cluster'] = data['Cluster'].fillna(-1).astype('int')
In [83]:
scaling_value = 'q95'
scaling_value_ubiq = 'q20'
data['avg_usage'] = data[usage_cols].mean(1) # mean of means
data['ts'] = (data['std']>data[scaling_value]).astype('int')
data['ubiq'] = ((data['avg_usage']>0.5)&(data['std']<data[scaling_value_ubiq])).astype('int')
data['low'] = ((data['avg_usage']<0.5)&(data['std']<data[scaling_value_ubiq])).astype('int')
data['cat'] = data['ts']*3+data['ubiq']*2+data['low']*1
In [84]:
Counter(data['cat'])
Out[84]:
Counter({0: 82781, 3: 4465, 1: 20358, 2: 163})
In [85]:
# try to separate particular clusters

from scipy.stats import binom

data['t']=1
gr = data.groupby(['Cluster','segment_class']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('Cluster').agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on='Cluster')
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
In [90]:
# try to separate tissue-specific vs non-tissue-specific

from scipy.stats import binom

data['t']=1
gr = data.groupby(['cat','segment_class']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('cat').agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on='cat')
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']

gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)

big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I','N']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic','ambigous']

map_dict = {}
i=0
for elem in big_cat_vals:
    map_dict[elem] = big_cat_labels[i]
    i=i+1
gr['segment_class_long'] = gr['segment_class'].map(map_dict)

map_dict = {0:'other PAS',1:'rarely used PAS',2:'ubiquitous PAS',3:'tissue-specific PAS',}
gr['ts'] = gr['cat'].map(map_dict)
In [91]:
gr.loc[gr['segment_class']=='TE']
Out[91]:
cat segment_class t t_sum % prop %_ci_up %_ci_down segment_class_long ts
4 0 TE 26453 82781 31.96 0.319554 32.27 31.64 terminal exon other PAS
10 1 TE 3577 20358 17.57 0.175705 18.10 17.05 terminal exon rarely used PAS
15 2 TE 133 163 81.60 0.815951 87.12 75.46 terminal exon ubiquitous PAS
21 3 TE 2630 4465 58.90 0.589026 60.34 57.45 terminal exon tissue-specific PAS
In [128]:
from statsmodels.stats import proportion as smprop

x_feature, y_feature, hue_feature = 'segment_class_long', '%','ts'
order = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
hue_order = ['rarely used PAS','other PAS','tissue-specific PAS','ubiquitous PAS']
palette = ['magenta','royalblue','green','orange']
dodge = 1.012

# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
    reorder_dict_x[x_val] = i
    i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
    reorder_dict_hue[hue_val] = i
    i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']+dodge*((gr_reordered['hue_order']+1)/(len(hue_order)+1)-0.5)

sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2, 1))

# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette)

ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])], 
            elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")

ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel='% of PAS in class')
ax.tick_params(left=True, bottom=True,width=0.5)

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [132]:
# association with motif presence

data['t']=1
gr = data.groupby(['cat','all_motif']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('cat').agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on='cat')
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['all_motif']==1].reset_index(drop=True)
map_dict = {0:'other PAS',1:'rarely used PAS',2:'ubiquitous PAS',3:'tissue-specific PAS',}
gr['ts'] = gr['cat'].map(map_dict)

from statsmodels.stats import proportion as smprop

x_feature, y_feature, hue_feature = 'all_motif', '%','ts'
order = [1]
hue_order = ['rarely used PAS','other PAS','tissue-specific PAS','ubiquitous PAS']
palette = ['magenta','royalblue','green','orange']
dodge = 1.012

# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
    reorder_dict_x[x_val] = i
    i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
    reorder_dict_hue[hue_val] = i
    i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']+dodge*((gr_reordered['hue_order']+1)/(len(hue_order)+1)-0.5)

sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(0.35, 1))

# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette)

ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]-0.5), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])], 
            elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")

ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel='Motif presence, %',xticks=[])
ax.tick_params(left=False, bottom=False,right=True,labelright=True,labelleft=False,width=0.5)
ax.yaxis.set_label_position("right")

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C_add.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C_add.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image

Prepare metadata¶

human¶

In [133]:
merged_num_PAS_each_class = pd.read_csv('./merged_num_PAS_each_class.csv',delimiter=",",index_col=None,header=0)
human_cell_ATLAS_metadata_fastq_final_final_alter = pd.read_csv('./human_cell_ATLAS_metadata_fastq_final_final_alter.csv',delimiter=",",index_col=None,header=0)
NeMO_manifest_metadata_23017c0cfd = pd.read_csv('./NeMO_manifest_metadata_23017c0cfd.tsv',delimiter="\t",index_col=None,header=0)
In [134]:
import re

def get_sample(x):
    sample = '10X_'+x['project.project_core.project_short_name']+'_'+x['file_name'].split('_')[0].replace('-','')
    if len(x['file_name'].split('_'))>1:
        tmp = x['file_name'].split('_')[1]
        if ((re.match('S[0-9]',tmp) and (
                                        (not x['file_name'].split('_')[0].startswith('HCAHeart')) and
                                        (not x['file_name'].split('_')[0].startswith('SIGAA')) and 
                                        (not x['file_name'].split('_')[0].startswith('SIGAC')) and 
                                        (not x['file_name'].split('_')[0].startswith('SIGAD4')) and 
                                        (not x['file_name'].split('_')[0].startswith('SIGAE4')) and 
                                        (not x['file_name'].split('_')[0].startswith('SIGAF4')) and 
                                        (not x['file_name'].split('_')[0].startswith('SIGAG4')) and 
                                        (not x['file_name'].split('_')[0].startswith('SIGAH4')) and 
                                        (not x['file_name'].split('_')[0].startswith('sample')) and 
                                        (not x['file_name'].split('_')[0].startswith('SRR')) and 
                                        (not x['file_name'].split('_')[0].startswith('CZIKidney')) and 
                                        (not x['file_name'].split('_')[0].startswith('3-')))) or 
                                             tmp in ['bamtofastq','cd45pos','Cornea','MUC9105'] or
                                               tmp.startswith('MUC11') or 
                                                tmp.startswith('Endo')):
            sample=sample+tmp
        elif x['file_name'].split('_')[1:3] == ['Adult','Cornea']:
            sample=sample+'AdultCornea'
        elif tmp in ['scRNAseq','HS','D17PrPzF','D35PrTzF','D17PrTzF','D27PrTzF','D35PrPzF','1','4','2','3','5','6','7'] or tmp.startswith('CD45pos') or tmp.startswith('TotalHK'):
            sample=sample+tmp+x['file_name'].split('_')[2]
        elif tmp in ['2nd','July','June','Wong']:
            sample=sample+tmp+x['file_name'].split('_')[2]+x['file_name'].split('_')[3]
    else:
        if x['file_name'].startswith('HK.') or x['file_name'].startswith('SW'):
            sample = '10X_'+x['project.project_core.project_short_name']+'_'+x['file_name'].split('.')[0]
    return sample

human_cell_ATLAS_metadata_fastq_final_final_alter['sample'] = human_cell_ATLAS_metadata_fastq_final_final_alter.apply(lambda x:get_sample(x),1)
In [135]:
NeMO_manifest_metadata_23017c0cfd['sample'] = NeMO_manifest_metadata_23017c0cfd.apply(lambda x:x['sample_id'][:3]+'_'+x['sample_id'][3:].replace('-','_').replace(';','_'),1)
In [136]:
tmp1 = pd.merge(human_cell_ATLAS_metadata_fastq_final_final_alter,merged_num_PAS_each_class[['sample','organ']],how='inner',on='sample')
tmp2 = pd.merge(NeMO_manifest_metadata_23017c0cfd,merged_num_PAS_each_class[['sample','organ']],how='inner',on='sample')
In [137]:
len(tmp2['sample'].unique())+len(tmp1['sample'].unique()),len(tmp1['sample'].unique()),len(tmp2['sample'].unique())
Out[137]:
(813, 722, 91)
In [138]:
tmp1[['sample','organ']+list(tmp1.columns[:-2])].to_csv('./HumanCellAtlas_match.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
In [139]:
tmp2[['sample','organ']+list(tmp2.columns[:-2])].to_csv('./Nemo_match.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
In [140]:
# new metadata from Young
final_filtered_human_metadata_240627 = pd.read_csv('./final_filtered_human_metadata_240627.csv',delimiter=",",index_col=None,header=0)
final_filtered_human_metadata_240627 = final_filtered_human_metadata_240627.loc[~final_filtered_human_metadata_240627['scinpas_sample'].isna()].reset_index(drop=True)
final_filtered_human_metadata_240627['DataBase'] = final_filtered_human_metadata_240627.apply(lambda x:'NeMO' if x['scinpas_organ']=='brain' else 'HCA',1)
In [141]:
HCA_full_manifest = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/hca-manifest-1d3efc71-5986-591b-bd99-45b65bce9b09.99950987-0051-50bf-9402-dc99918f99dc.tsv',delimiter="\t",index_col=None,header=0)
HCA_full_manifest['HCA_present'] = 1
In [142]:
test = pd.merge(final_filtered_human_metadata_240627,HCA_full_manifest[['bundle_uuid','HCA_present']],how='left',on='bundle_uuid')
matched = test.loc[((test['scinpas_organ']!='brain')&(~test['HCA_present'].isna()))|(test['scinpas_organ']=='brain')].reset_index(drop=True)
not_matched = test.loc[(test['scinpas_organ']!='brain')&(test['HCA_present'].isna())].reset_index(drop=True)
not_matched['match_by'] = not_matched['file_name'].str.replace('_001.fastq.gz','')
HCA_full_manifest['match_by'] = HCA_full_manifest['file_name'].str.replace('.fastq.gz','')
not_matched = pd.merge(not_matched.drop(['bundle_uuid','HCA_present'],1),HCA_full_manifest[['match_by','bundle_uuid','HCA_present']],how='left',on='match_by')
still_not_matched = not_matched.loc[not_matched['bundle_uuid'].isna()].reset_index(drop=True)
still_not_matched['bundle_uuid'] = 'NA'
now_matched = not_matched.loc[~not_matched['bundle_uuid'].isna()].reset_index(drop=True)

cols = ['scinpas_sample','DataBase','scinpas_organ']+['bundle_uuid',
 'file_name',
 'file_format',
 'file_size',
 'cell_suspension.selected_cell_type',
 'library_preparation_protocol.library_construction_approach',
 'library_preparation_protocol.nucleic_acid_source',
 'project.project_core.project_short_name',
 'specimen_from_organism.diseases',
 'specimen_from_organism.organ',
 'specimen_from_organism.organ_part',
 'donor_organism.biomaterial_core.biomaterial_id',
 'donor_organism.genus_species',
 'donor_organism.development_stage',
 'donor_organism.diseases',
 'donor_organism.organism_age',
 'sample.biomaterial_core.biomaterial_id','sequencing_quality']

final_metadata = pd.concat([matched[cols],now_matched[cols],still_not_matched[cols]]).reset_index(drop=True)
final_metadata = final_metadata.drop_duplicates(['DataBase','scinpas_organ','scinpas_sample','file_name']).sort_values(['DataBase','scinpas_organ','scinpas_sample']).reset_index(drop=True)
final_metadata['bundle_uuid'] = final_metadata.apply(lambda x:x['bundle_uuid'] if x['DataBase']=='HCA' else 'NA',1)
In [150]:
# rename tissues
tissue_rename_dict = {}
tissue_rename_dict['trachea']= 'tracheal epithelium'
tissue_rename_dict['nose']= 'nasal mucosa'
tissue_rename_dict['kidney']= 'kidney parenchyma'
tissue_rename_dict['intestine'] = 'intestine'
tissue_rename_dict['bone'] = 'intervertebral disc'
tissue_rename_dict['penis'] = 'corpus cavernosum'

tissue_map = {}
for tissue in list(final_metadata['scinpas_organ'].unique()):
    if tissue in list(tissue_rename_dict.keys()):
        tissue_map[tissue] = tissue_rename_dict[tissue]
    else:
        tissue_map[tissue] = tissue

final_metadata['scinpas_organ_new'] = final_metadata['scinpas_organ'].map(tissue_map)
final_metadata['scinpas_organ'] = final_metadata['scinpas_organ_new']
final_metadata = final_metadata.drop('scinpas_organ_new',1)
In [156]:
final_metadata.to_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/sample_and_file_metadata.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)

mouse¶

In [62]:
mouse_metadata_final_filtered = pd.read_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/mouse_metadata_final_filtered.csv",delimiter=",",index_col=None,header=0)
mouse_metadata_final_filtered = mouse_metadata_final_filtered.rename(columns={'scinpas':'scinpas_sample','organ':'scinpas_organ'})
mouse_metadata_final_filtered['DataBase'] = 'Tabula Muris Senis'

official_metadata = pd.read_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/GSM4505404_tabula-muris-senis-droplet-official-raw-obj-metadata.csv.gz",delimiter=",",index_col=None,header=0)
official_metadata['sample'] = official_metadata.apply(lambda x:'_'.join(x['cell'].split('_')[:-1]),1)
official_metadata = official_metadata.drop_duplicates('sample').reset_index(drop=True)
mouse_metadata_final_filtered = pd.merge(mouse_metadata_final_filtered,official_metadata[['sample','age','mouse.id','sex','subtissue']],how='left',on='sample')
mouse_metadata_final_filtered = mouse_metadata_final_filtered[['scinpas_sample','DataBase','scinpas_organ','age','mouse.id','sex','subtissue']]
mouse_metadata_final_filtered.to_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/sample_and_file_metadata.tsv",sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)

worm¶

In [81]:
worm_metadata_final_filtered = pd.read_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/worm_metadata_final_filtered.csv",delimiter=",",index_col=None,header=0)
worm_metadata_final_filtered.columns = ['SRA_id','scinpas_sample','strain']
worm_metadata_final_filtered[['scinpas_sample','SRA_id','strain']].to_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/polyAsite_Atlas_3/sample_and_file_metadata.tsv",sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)

Random things¶

In [121]:
# make big bed file
a = []
for elem in list('chr'+pd.Series(range(1,22)).astype('str')):
    for i in range(0,10**6,100):
        a.append([elem,i,i+99])
bed_file = pd.DataFrame(a,columns = ['chr','start','end'])

bed_file['len'] = bed_file['end']-bed_file['start']
bed_file['cumul_len'] = bed_file['len'].cumsum(axis=0)
M = bed_file['len'].sum()
In [136]:
# make a uniform random sample
sample_size = 10**6
from scipy.stats import randint
r = randint.rvs(0, M, size=sample_size)
In [137]:
# now get the actual genomic coordinates
r.sort()
r = pd.Series(r)
In [132]:
genomic_positions = []
chromosomes = []
pred = 0
k=0 # iterator
start_time = time.time()
for index,row in bed_file.iterrows():
    r_sub = r.loc[(r<row['cumul_len'])&(r>=pred)]
    chr = row['chr']
    chromosomes = chromosomes+[chr]*len(r_sub)
    genomic_positions = genomic_positions+list(row['start']+r_sub-pred)
    pred = row['cumul_len']
    if k%5000==0:
        print(str(k)+' done, '+str(time.time()-start_time))
    k=k+1
0 done, 0.04459881782531738
5000 done, 8.946052551269531
10000 done, 18.63933253288269
15000 done, 29.077177047729492
20000 done, 40.23433971405029
25000 done, 52.232123613357544
30000 done, 64.98941254615784
35000 done, 78.56097292900085
40000 done, 93.05825996398926
45000 done, 108.46607065200806
50000 done, 125.21714735031128
55000 done, 142.62919116020203
60000 done, 161.10411477088928
65000 done, 180.62160897254944
70000 done, 201.32123613357544
75000 done, 222.88662695884705
80000 done, 245.30620408058167
85000 done, 269.0161051750183
90000 done, 294.012845993042
95000 done, 320.641884803772
100000 done, 348.86718225479126
105000 done, 377.6022136211395
110000 done, 407.78930020332336
115000 done, 439.3883044719696
120000 done, 472.32599997520447
125000 done, 506.6241524219513
130000 done, 542.3440065383911
135000 done, 579.6196844577789
140000 done, 618.4077925682068
145000 done, 658.4839911460876
150000 done, 699.9941833019257
155000 done, 742.6552038192749
160000 done, 786.8064415454865
165000 done, 832.4553668498993
170000 done, 879.4657661914825
175000 done, 927.979006767273
180000 done, 978.0892655849457
185000 done, 1029.5985431671143
190000 done, 1082.7922778129578
195000 done, 1136.6507608890533
200000 done, 1192.069009065628
205000 done, 1250.054283618927
In [ ]:
resulting_df = pd.DataFrame([chromosomes,genomic_positions]).transpose()
resulting_df.columns = ['chr','pos']
In [91]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1, 1, sharey=True, sharex=False,figsize=(5,5))

ax = sns.histplot(data = r,stat='proportion',bins=10)
ax.set(xlim=(0,M))
Out[91]:
[(0.0, 16600.0)]
No description has been provided for this image

Fraction of intergenic etc across samples and tissues¶

In [4]:
# merged_num_PAS_each_class_rpm = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/merged_num_PAS_each_class_rpm.csv',delimiter=",",index_col=None,header=0)
# merged_num_PAS_each_class_rpm['total'] = merged_num_PAS_each_class_rpm[['intronic', 'exonic', 'TE', 'true_intergenic',
#        'antisense_intronic', 'antisense_exonic', 'antisense_TE']].sum(1)

merged_num_PAS_each_class = pd.read_csv('./merged_num_PAS_each_class.csv',delimiter=",",index_col=None,header=0)
merged_num_PAS_each_class['total'] = merged_num_PAS_each_class[['intronic', 'exonic', 'TE', 'true_intergenic',
       'antisense_intronic', 'antisense_exonic', 'antisense_TE']].sum(1)
In [5]:
merged_num_PAS_each_class.head()
Out[5]:
sample organ intronic exonic TE true_intergenic antisense_intronic antisense_exonic antisense_TE total_PAS total
0 10X_131_1 brain 7323 939 2489 2076 1809 94 142 814610 14872
1 10X_131_2 brain 1677 294 1248 511 405 32 41 814610 4208
2 10X_131_3 brain 1258 231 1129 370 324 22 28 814610 3362
3 10X_131_4 brain 940 200 941 297 257 15 27 814610 2677
4 10X_131_7 brain 8052 948 2554 1891 1657 112 131 814610 15345
In [ ]:
 
In [3]:
lims = (0,6)

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1, 2, sharey=True, sharex=False,figsize=(10,5))

merged_num_PAS_each_class['total_log'] = np.log10(merged_num_PAS_each_class['total']+1)
merged_num_PAS_each_class_rpm['total_log'] = np.log10(merged_num_PAS_each_class_rpm['total']+1)
for category in ['antisense_TE','antisense_exonic','antisense_intronic','true_intergenic','exonic','intronic','TE']:
    merged_num_PAS_each_class[category+'_log'] = np.log10(merged_num_PAS_each_class[category]+1)
    merged_num_PAS_each_class[category+'_%'] = np.round(merged_num_PAS_each_class[category]/merged_num_PAS_each_class['total']*100,2)
    
    merged_num_PAS_each_class_rpm[category+'_%'] = np.round(merged_num_PAS_each_class_rpm[category]/merged_num_PAS_each_class_rpm['total']*100,2)
    
    ax0 = sns.regplot(ax=axes[0],data = merged_num_PAS_each_class,y=category+'_%',x='total_log',label = category,scatter_kws={'s':5})
    ax1 = sns.regplot(ax=axes[1],data = merged_num_PAS_each_class_rpm,y=category+'_%',x='total_log',label = category,scatter_kws={'s':5})

ax0.set(ylabel = '% in PAS class',xlabel = '# total supported PAS, $log_{10}$',title='# supported PAS\n(by a  least one read)')

ax1.legend(bbox_to_anchor=(1.05, 1),loc=2,borderaxespad=0,title='PAS class',markerscale=1.5,ncols=1,fontsize=9,mode=None)
ax1.set(ylabel = '',xlabel = '# total polyA reads, $log_{10}$',title='# polyA reads')
# ax.set(xlim=lims,ylim=lims)
Out[3]:
[Text(0, 0.5, ''),
 Text(0.5, 0, '# total polyA reads, $log_{10}$'),
 Text(0.5, 1.0, '# polyA reads')]
No description has been provided for this image
In [5]:
merged_num_PAS_each_class.head()
Out[5]:
sample organ intronic exonic TE true_intergenic antisense_intronic antisense_exonic antisense_TE total_PAS ... antisense_intronic_log antisense_intronic_% true_intergenic_log true_intergenic_% exonic_log exonic_% intronic_log intronic_% TE_log TE_%
0 10X_131_1 brain 7323 939 2489 2076 1809 94 142 814610 ... 3.257679 12.16 3.317436 13.96 2.973128 6.31 3.864748 49.24 3.396199 16.74
1 10X_131_2 brain 1677 294 1248 511 405 32 41 814610 ... 2.608526 9.62 2.709270 12.14 2.469822 6.99 3.224792 39.85 3.096562 29.66
2 10X_131_3 brain 1258 231 1129 370 324 22 28 814610 ... 2.511883 9.64 2.569374 11.01 2.365488 6.87 3.100026 37.42 3.053078 33.58
3 10X_131_4 brain 940 200 941 297 257 15 27 814610 ... 2.411620 9.60 2.474216 11.09 2.303196 7.47 2.973590 35.11 2.974051 35.15
4 10X_131_7 brain 8052 948 2554 1891 1657 112 131 814610 ... 3.219585 10.80 3.276921 12.32 2.977266 6.18 3.905958 52.47 3.407391 16.64

5 rows × 26 columns

In [4]:
merged_num_PAS_each_class_rpm.head()
Out[4]:
sample organ intronic exonic TE true_intergenic antisense_intronic antisense_exonic antisense_TE total_read total_PAS total total_log antisense_TE_% antisense_exonic_% antisense_intronic_% true_intergenic_% exonic_% intronic_% TE_%
0 10X_131_1 brain 10140.0 7843.0 48520.0 4233.0 3485.0 152.0 636.0 75009.0 NaN 75009.0 4.875119 0.85 0.20 4.65 5.64 10.46 13.52 64.69
1 10X_131_2 brain 1894.0 1364.0 7893.0 683.0 576.0 28.0 96.0 12534.0 NaN 12534.0 4.098124 0.77 0.22 4.60 5.45 10.88 15.11 62.97
2 10X_131_3 brain 1399.0 942.0 6219.0 528.0 381.0 16.0 77.0 9562.0 NaN 9562.0 3.980594 0.81 0.17 3.98 5.52 9.85 14.63 65.04
3 10X_131_4 brain 905.0 668.0 4342.0 389.0 268.0 18.0 59.0 6649.0 NaN 6649.0 3.822822 0.89 0.27 4.03 5.85 10.05 13.61 65.30
4 10X_131_7 brain 11564.0 7643.0 48780.0 4177.0 3044.0 156.0 704.0 76068.0 NaN 76068.0 4.881208 0.93 0.21 4.00 5.49 10.05 15.20 64.13
In [14]:
merged_num_PAS_each_class_rpm['project'] = merged_num_PAS_each_class_rpm['sample'].str.split('_',expand=True)[1]
merged_num_PAS_each_class_rpm['organ_x_project'] = merged_num_PAS_each_class_rpm['organ']+' '+merged_num_PAS_each_class_rpm['project']

gr2 = merged_num_PAS_each_class_rpm[['organ']].drop_duplicates().reset_index(drop=True)
gr2['color'] = list(sns.color_palette('husl',len(gr2)))

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,5, sharey=False, sharex=False,figsize=(15,5))

gr = merged_num_PAS_each_class_rpm.groupby(['organ','project','organ_x_project','sample']).agg({'total_log':np.median}).reset_index()
gr1 = gr.groupby(['organ']).agg({'total_log':np.median}).reset_index().rename(columns={'total_log':'total_log_organ'})
gr1 = pd.merge(gr1,gr2,how='left',on='organ')
gr1 = gr1.sort_values('total_log_organ',ascending=False).reset_index(drop=True)

ax = sns.swarmplot(ax=axes[0],data = gr,y='organ',order = list(gr1['organ']), palette = list(gr1['color']), x='total_log',edgecolor='black',linewidth=0,s=2)
ax.set(xlabel = '# total polyA reads, $log_{10}$')

i=1
for category in ['true_intergenic','antisense_intronic','intronic','TE']:
    gr = merged_num_PAS_each_class_rpm.groupby(['organ','project','organ_x_project','sample']).agg({category+'_%':np.median}).reset_index()
    gr1 = gr.groupby(['organ']).agg({category+'_%':np.median}).reset_index().rename(columns={category+'_%':category+'_%_organ'})
    gr1 = pd.merge(gr1,gr2,how='left',on='organ')
    gr1 = gr1.sort_values(category+'_%_organ',ascending=False).reset_index(drop=True)
    
    gr = pd.merge(gr,gr1,how='left',on='organ')
    gr = pd.merge(gr,gr2,how='left',on='organ')
    gr = gr.sort_values([category+'_%_organ',category+'_%'],ascending=[False,False]).reset_index(drop=True)
    
    ax = sns.swarmplot(ax=axes[i],data = gr,y='organ',order = list(gr1['organ']), palette = list(gr1['color']), x=category+'_%',edgecolor='black',linewidth=0,s=2)
    ax.set(ylabel='')
    i=i+1
fig.tight_layout(pad=0.5)
No description has been provided for this image

Comparison with polyAsite 2 Atlas¶

In [22]:
merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [3,4,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21,22])
merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category')
In [23]:
len(merged_pas_motif_table)
Out[23]:
18432135
In [24]:
SCINPAS_full = pd.read_csv('/scicore/home/zavolan/moon0000/GENE_ID/result/organ_score/pas_with_gene_id_v1.0.2_w_organ_score.bed',delimiter="\t",index_col=None,header=0)
In [29]:
tissues = ['nose', 'trachea', 'heart', 'intestine', 'breast', 'bone',
       'pancreas', 'eye', 'kidney', 'penis', 'ureter', 'lung', 'liver', 'skin',
       'prostate', 'uterus', 'bloodImmune', 'brain']
In [31]:
merged_pas_motif_table = pd.merge(SCINPAS_full[['seqid','start','end','id','score']+tissues].rename(columns={'score':'score_1'}),
         merged_pas_motif_table,how='inner',on=['id'])
In [32]:
merged_pas_motif_table = merged_pas_motif_table.drop('score',1).rename(columns={'score_1':'score'}) # score_1 from SCINPAS 
In [33]:
len(merged_pas_motif_table)
Out[33]:
18432135
In [35]:
cols = list(merged_pas_motif_table.columns)
motifs = cols[-15:-4]
In [51]:
merged_pas_motif_table[motifs] = merged_pas_motif_table[motifs].astype('boolean')

merged_pas_motif_table['any_canonic_motif'] = (merged_pas_motif_table[motifs].sum(1)>0).astype('int')
In [52]:
motifs_to_search = [elem.replace('U','T') for elem in motifs]
In [53]:
polyAsite = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas/atlas.clusters.2.0.GRCh38.96.bed',delimiter="\t",index_col=None,header=None)
In [54]:
def parse_motifs(x):
    if pd.isna(x[10]):
        return 0
    else:
        l = x[10].split(';')
        for motif in motifs_to_search:
            for elem in l:
                if elem.split('@')[0]==motif:
                    pos = int(elem.split('@')[1])
                    if pos>=-35 and pos<=-10:
                        return 1
    return 0

polyAsite['any_canonic_motif'] = polyAsite.apply(lambda x: parse_motifs(x),1)
In [55]:
Counter(polyAsite['any_canonic_motif'])
Out[55]:
Counter({1: 342526, 0: 226479})
In [83]:
(342526+226479), 342526/(342526+226479) # t_PAS and % motif in v2 polyAsite Atlas
Out[83]:
(569005, 0.601973620618448)
In [86]:
motif_thresholds = [0.6,0.65,0.7,0.75,0.8,0.85]

a = {}
for tissue in tissues:
    tmp = merged_pas_motif_table[['id',tissue,'any_canonic_motif']].sort_values(tissue,ascending=False).reset_index(drop=True)
    tmp['t']=1
    tmp['t_cumsum'] = tmp['t'].cumsum()
    tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
    tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
    
    for motif_threshold in motif_thresholds:
        max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
        pas_to_append = list(tmp.loc[0:max_index]['id'])
        if a.get(motif_threshold) is not None:
            a[motif_threshold] = a[motif_threshold]+pas_to_append
        else:
            a[motif_threshold] = pas_to_append
    print(tissue+' done')
res = {}
summary = []
for motif_threshold in motif_thresholds:
    tmp = pd.DataFrame(list(set(a[motif_threshold])),columns = ['id'])
    tmp = pd.merge(tmp,merged_pas_motif_table[['id','any_canonic_motif']],how='left',on='id')
    summary.append([motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)])
    print(str(motif_threshold)+' done')
summary = pd.DataFrame(summary,columns = ['motif_threshold','t_PAS','motif_fraction'])
nose done
trachea done
heart done
intestine done
breast done
bone done
pancreas done
eye done
kidney done
penis done
ureter done
lung done
liver done
skin done
prostate done
uterus done
bloodImmune done
brain done
0.6 done
0.65 done
0.7 done
0.75 done
0.8 done
0.85 done
In [87]:
summary.columns = ['within_tissue_motif_threshold','# PAS', 'fraction of PAS with motif after union']
In [88]:
summary
Out[88]:
within_tissue_motif_threshold # PAS fraction of PAS with motif after union
0 0.60 310975 0.451477
1 0.65 224565 0.494743
2 0.70 160240 0.545363
3 0.75 113060 0.606112
4 0.80 78177 0.675672
5 0.85 51491 0.756890

Quantify usage instead of RPM¶

In [2]:
merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [3,4,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21,22])
merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category')
In [3]:
len(merged_pas_motif_table)
Out[3]:
18432135
In [4]:
cols = list(merged_pas_motif_table.columns)
motifs = cols[-15:-4]
motifs
Out[4]:
['AAUAAA',
 'AUUAAA',
 'UAUAAA',
 'AGUAAA',
 'AAUACA',
 'AAUAUA',
 'CAUAAA',
 'AAUGAA',
 'GAUAAA',
 'ACUAAA',
 'AAUAGA']
In [93]:
len(motifs)
Out[93]:
11
In [5]:
merged_pas_motif_table[motifs] = merged_pas_motif_table[motifs].astype('boolean')

merged_pas_motif_table['any_canonic_motif'] = (merged_pas_motif_table[motifs].sum(1)>0).astype('int')
In [6]:
SCINPAS_full = pd.read_csv('/scicore/home/zavolan/moon0000/GENE_ID/result/organ_score/pas_with_gene_id_v1.0.2_w_organ_score.bed',delimiter="\t",index_col=None,header=0)
In [7]:
tmp_dir = '/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/'
out = subprocess.check_output('mkdir -p '+tmp_dir, shell=True)

SCINPAS_full['new_id'] = SCINPAS_full.index
SCINPAS_full['score_tmp'] = 1
SCINPAS_full[['seqid','start','end','new_id','score_tmp','strand']].to_csv(tmp_dir+'scinpas_full.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [8]:
'bedtools sort -i '+tmp_dir+'scinpas_full.bed > '+tmp_dir+'scinpas_full.sorted.bed'
Out[8]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/scinpas_full.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/scinpas_full.sorted.bed'
In [8]:
motif_info = pd.merge(SCINPAS_full[['id','new_id']],
         merged_pas_motif_table[['id','any_canonic_motif']],how='inner',on=['id'])
In [9]:
len(motif_info)
Out[9]:
18432135
In [7]:
# take GENCODE gtf, extract genes from there, and subset only non-overlapping genes
gtf = pd.read_csv('/scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38_v42/gencode.v42.annotation.gtf',delimiter="\t",index_col=None,header=None,skiprows=5)
genes = gtf.loc[gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('"',expand=True)[0]
genes['start'] = genes[3]-1
genes['score_tmp'] = 1
genes = genes.drop_duplicates([0,3,4,6]).reset_index(drop=True)
In [10]:
genes[[0,'start',4,'gene_id','score_tmp',6]].to_csv(tmp_dir+'genes.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [11]:
'bedtools sort -i '+tmp_dir+'genes.bed > '+tmp_dir+'genes.sorted.bed'
Out[11]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.sorted.bed'
In [23]:
'bedtools cluster -d 1001 -s -i '+tmp_dir+'genes.sorted.bed > '+tmp_dir+'genes.clustered.bed'
Out[23]:
'bedtools cluster -d 1001 -s -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.sorted.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.clustered.bed'
In [29]:
genes_clustered = pd.read_csv(tmp_dir+'genes.clustered.bed',delimiter="\t",index_col=None,header=None)
genes_clustered['t']=1
gr = genes_clustered.groupby([6]).agg({'t':np.sum}).reset_index()

non_overlap_genes = pd.merge(genes_clustered,gr.loc[gr['t']==1][[6]].reset_index(drop=True),how='inner',on=[6])

plus = non_overlap_genes.loc[non_overlap_genes[5]=='+'].reset_index(drop=True)
minus = non_overlap_genes.loc[non_overlap_genes[5]=='-'].reset_index(drop=True)

plus[2] = plus[2]+1000 # add downstream 1 kb region to include
minus[1] = minus[1]-1000 # add downstream 1 kb region to include

non_overlap_genes = pd.concat([plus,minus]).reset_index(drop=True)
In [31]:
non_overlap_genes[[0,1,2,3,4,5]].to_csv(tmp_dir+'genes.non_overlap.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [28]:
'bedtools sort -i '+tmp_dir+'genes.non_overlap.bed > '+tmp_dir+'genes.non_overlap.sorted.bed'
Out[28]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.sorted.bed'
In [32]:
len(non_overlap_genes)
Out[32]:
39528
In [42]:
'bedtools intersect -sorted -f 1.0 -s -wa -wb -a '+tmp_dir+'scinpas_full.sorted.bed -b '+tmp_dir+'genes.non_overlap.sorted.bed | cut -f1,4,6,8,9,10 > '+tmp_dir+'intersection.bed'
Out[42]:
'bedtools intersect -sorted -f 1.0 -s -wa -wb -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/scinpas_full.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.sorted.bed | cut -f1,4,6,8,9,10 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/intersection.bed'
In [10]:
intersection = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/intersection.bed',delimiter="\t",index_col=None,header=None)
intersection.columns = ['chr','new_id','strand','gene_start','gene_end','gene_id']
intersection[['chr','strand']] = intersection[['chr','strand']].astype('category')
In [11]:
intersection['t']=1
gr = intersection.groupby('new_id').agg({'t':sum}).reset_index()
In [12]:
len(intersection)
Out[12]:
5388855
In [13]:
tissues = ['nose', 'trachea', 'heart', 'intestine', 'breast', 'bone',
       'pancreas', 'eye', 'kidney', 'penis', 'ureter', 'lung', 'liver', 'skin',
       'prostate', 'uterus', 'bloodImmune', 'brain']
In [14]:
df = pd.merge(intersection[['new_id','gene_id']],SCINPAS_full[['new_id','score','class']+tissues],how='inner',on='new_id')
In [15]:
df = pd.merge(df,motif_info[['new_id','any_canonic_motif']],how='inner',on='new_id')
In [16]:
len(df)
Out[16]:
5388855
In [17]:
df['class'] = df['class'].astype('category')

usage_input_cols = ['score']+tissues

group_dict = {}
for col in usage_input_cols:
    group_dict[col] = np.sum

gr = df.groupby('gene_id').agg(group_dict).reset_index()

col_to_rename = list(gr.columns[1:])
rename_dict = {}
for col in col_to_rename:
    rename_dict[col] = col+'_sum'
gr = gr.rename(columns = rename_dict)

df = pd.merge(df,gr,how='inner',on='gene_id')
In [18]:
denom_usage_input_cols = [elem+'_sum' for elem in usage_input_cols]
In [19]:
ratio_cols = [elem+'_ratio' for elem in usage_input_cols]

df[ratio_cols] = (df[usage_input_cols].values/(df[denom_usage_input_cols]+10**(-30)).values)
In [20]:
df['t']=1
gr = df.groupby('gene_id').agg({'t':sum}).reset_index()
df = pd.merge(df.drop('t',1),gr,how='inner',on='gene_id')
In [21]:
len(df)
Out[21]:
5388855
In [22]:
df = df.loc[df['t']>1].reset_index(drop=True)
In [23]:
len(df)
Out[23]:
5385702
In [24]:
len(df['t'].unique())
Out[24]:
1828
In [ ]:
# what would be the numbers for polyAsite Atlas v2 if we only consider these regions
In [33]:
polyAsite = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas/atlas.clusters.2.0.GRCh38.96.bed',delimiter="\t",index_col=None,header=None)
In [45]:
polyAsite_short = polyAsite[[0,1,2,3,4,5]]
polyAsite_short[4] = 1
polyAsite_short[0] = 'chr'+polyAsite_short[0].astype('str')
polyAsite_short = polyAsite_short.loc[polyAsite_short[0].isin(list(non_overlap_genes[0].unique()))].reset_index(drop=True)
polyAsite_short.to_csv(tmp_dir+'atlas_v2.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [48]:
'bedtools sort -i '+tmp_dir+'atlas_v2.bed > '+tmp_dir+'atlas_v2.sorted.bed'
Out[48]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/atlas_v2.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/atlas_v2.sorted.bed'
In [50]:
'bedtools intersect -sorted -f 1.0 -s -wa -a '+tmp_dir+'atlas_v2.sorted.bed'+' -b '+tmp_dir+'genes.non_overlap.sorted.bed > '+tmp_dir+'Atlas_v2.intersection.bed'
Out[50]:
'bedtools intersect -sorted -f 1.0 -s -wa -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/atlas_v2.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.sorted.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/Atlas_v2.intersection.bed'
In [51]:
polyAsite_subset = pd.read_csv(tmp_dir+'Atlas_v2.intersection.bed',delimiter="\t",index_col=None,header=None)
In [54]:
len(polyAsite),len(polyAsite.loc[polyAsite[3].isin(list(polyAsite_subset[3].unique()))])
Out[54]:
(569005, 195574)
In [55]:
polyAsite_subset = polyAsite.loc[polyAsite[3].isin(list(polyAsite_subset[3].unique()))].reset_index(drop=True)
In [58]:
motifs_to_search = [elem.replace('U','T') for elem in motifs]

def parse_motifs(x):
    if pd.isna(x[10]):
        return 0
    else:
        l = x[10].split(';')
        for motif in motifs_to_search:
            for elem in l:
                if elem.split('@')[0]==motif:
                    pos = int(elem.split('@')[1])
                    if pos>=-35 and pos<=-10:
                        return 1
    return 0

polyAsite_subset['any_canonic_motif'] = polyAsite_subset.apply(lambda x: parse_motifs(x),1)
In [60]:
len(polyAsite_subset),len(polyAsite_subset.loc[polyAsite_subset['any_canonic_motif']==1])/len(polyAsite_subset)
Out[60]:
(195574, 0.5822655363187336)
In [61]:
# these are the metrics to achieve
In [69]:
# check the performance of scores as they are on this smaller subset
motif_thresholds = [0.6,0.65,0.7,0.75,0.8,0.85]

a = {}
for tissue in tissues:
    tmp = df[['new_id',tissue,'any_canonic_motif']].sort_values(tissue,ascending=False).reset_index(drop=True)
    tmp['t']=1
    tmp['t_cumsum'] = tmp['t'].cumsum()
    tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
    tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
    
    for motif_threshold in motif_thresholds:
        max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
        pas_to_append = list(tmp.loc[0:max_index]['new_id'])
        if a.get(motif_threshold) is not None:
            a[motif_threshold] = a[motif_threshold]+pas_to_append
        else:
            a[motif_threshold] = pas_to_append
    print(tissue+' done')
res = {}
summary = []
for motif_threshold in motif_thresholds:
    tmp = pd.DataFrame(list(set(a[motif_threshold])),columns = ['new_id'])
    tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
    summary.append([motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)])
    print(str(motif_threshold)+' done')
summary = pd.DataFrame(summary,columns = ['motif_threshold','t_PAS','motif_fraction'])
nose done
trachea done
heart done
intestine done
breast done
bone done
pancreas done
eye done
kidney done
penis done
ureter done
lung done
liver done
skin done
prostate done
uterus done
bloodImmune done
brain done
0.6 done
0.65 done
0.7 done
0.75 done
0.8 done
0.85 done
In [70]:
summary.columns = ['within_tissue_motif_threshold','# PAS', 'fraction of PAS with motif after union']
In [71]:
summary
Out[71]:
within_tissue_motif_threshold # PAS fraction of PAS with motif after union
0 0.60 162340 0.443606
1 0.65 119060 0.487443
2 0.70 87939 0.538828
3 0.75 63687 0.601300
4 0.80 44893 0.673557
5 0.85 30849 0.753606
In [72]:
195574/63687
Out[72]:
3.0708621853753515
In [75]:
# if not within each tissue
a = {}

tmp = df[['new_id','score','any_canonic_motif']].sort_values('score',ascending=False).reset_index(drop=True)
tmp['t']=1
tmp['t_cumsum'] = tmp['t'].cumsum()
tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']

for motif_threshold in motif_thresholds:
    max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
    pas_to_append = list(tmp.loc[0:max_index]['new_id'])
    if a.get(motif_threshold) is not None:
        a[motif_threshold] = a[motif_threshold]+pas_to_append
    else:
        a[motif_threshold] = pas_to_append
res = {}
summary = []
for motif_threshold in motif_thresholds:
    tmp = pd.DataFrame(list(set(a[motif_threshold])),columns = ['new_id'])
    tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
    summary.append([motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)])
    print(str(motif_threshold)+' done')
summary = pd.DataFrame(summary,columns = ['motif_threshold','t_PAS','motif_fraction'])
summary
0.6 done
0.65 done
0.7 done
0.75 done
0.8 done
0.85 done
Out[75]:
motif_threshold t_PAS motif_fraction
0 0.60 48593 0.600004
1 0.65 38183 0.650001
2 0.70 30482 0.700020
3 0.75 24085 0.750010
4 0.80 18488 0.800032
5 0.85 13527 0.850004
In [76]:
195574/48593
Out[76]:
4.024736073096948
In [80]:
# add quantiles by number of PAS in the gene
# df['PAS_num_cat'] = pd.qcut(df['t'],q = 500)
df['PAS_num_cat'] = df['t']

motif_threshold = 0.6
feature = 'score'

def get_filtered_PAS(L,data,feature,motif_threshold,iterator):

    # 1. define expression quantiles and loop within them
    expr_feature = feature+'_sum'
    scoring_feature = feature+'_ratio'
    
    data['expr_cat'] = pd.qcut(data[expr_feature],q=10,duplicates='drop')
    for expr_cat in list(data['expr_cat'].unique()):
        tmp = data.loc[data['expr_cat']==expr_cat].reset_index(drop=True)
        tmp = tmp[['new_id',scoring_feature,'any_canonic_motif']].sort_values(scoring_feature,ascending=False).reset_index(drop=True)
        tmp['t']=1
        tmp['t_cumsum'] = tmp['t'].cumsum()
        tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
        tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
        if tmp['frac_cumul'].max()>=motif_threshold:
            max_index = max(tmp.loc[tmp['frac_cumul']>=motif_threshold].index)
            pas_to_append = list(tmp.loc[0:max_index]['new_id'])
            L.append(pas_to_append)
        
    if iterator%5==0:
        print(str(iterator)+' done, '+str(time.time()-start_time))     
    
start_time = time.time()

with Manager() as manager:
    L = manager.list()
    processes = []
    i=0
    for PAS_num_cat in list(df['PAS_num_cat'].unique()):
        data = df.loc[df['PAS_num_cat']==PAS_num_cat].reset_index(drop=True)
        p = Process(target=get_filtered_PAS, args=(L,data,feature,motif_threshold,i))  # Passing the list
        p.start()
        processes.append(p)
        i=i+1
    for p in processes:
        p.join()
    L = list(L)
flattened = []
for elem in L:
    flattened = flattened+elem
flattened = list(set(flattened))
0 done, 0.2471604347229004
5 done, 0.6588912010192871
10 done, 1.0607943534851074
15 done, 1.4608628749847412
20 done, 1.8512182235717773
25 done, 2.256883382797241
30 done, 2.683002471923828
35 done, 3.0614254474639893
40 done, 3.464388847351074
45 done, 3.867133855819702
50 done, 4.2730114459991455
55 done, 4.648865461349487
60 done, 5.08697772026062
65 done, 5.479453802108765
70 done, 5.8689141273498535
75 done, 6.2871129512786865
80 done, 6.686724662780762
85 done, 7.074393272399902
90 done, 7.4492998123168945
95 done, 7.856955289840698
100 done, 8.238194465637207
105 done, 8.64706540107727
110 done, 9.030844926834106
115 done, 9.443076610565186
120 done, 9.844780921936035
125 done, 10.239341974258423
130 done, 10.628443717956543
135 done, 11.011773824691772
140 done, 11.42781114578247
145 done, 11.8036630153656
150 done, 12.17158031463623
155 done, 12.577524662017822
160 done, 12.946401119232178
165 done, 13.345419645309448
170 done, 13.738389492034912
175 done, 14.143964529037476
180 done, 14.518626928329468
185 done, 14.90165662765503
190 done, 15.331796884536743
195 done, 15.73563838005066
200 done, 16.135384559631348
205 done, 16.53104329109192
210 done, 16.907857656478882
215 done, 17.326249837875366
220 done, 17.73921489715576
225 done, 18.1227707862854
230 done, 18.50727081298828
235 done, 18.90893530845642
240 done, 19.289853811264038
245 done, 19.69845747947693
250 done, 20.077762365341187
255 done, 20.459183931350708
260 done, 20.863707065582275
265 done, 21.253548622131348
270 done, 21.65330195426941
275 done, 22.085458517074585
280 done, 22.451693058013916
285 done, 22.84962296485901
290 done, 23.227025032043457
295 done, 23.6243577003479
300 done, 24.011998653411865
305 done, 24.41333293914795
310 done, 24.796615839004517
315 done, 25.197178840637207
320 done, 25.599594593048096
325 done, 25.983662843704224
330 done, 26.36910319328308
335 done, 26.7863347530365
340 done, 27.162992238998413
345 done, 27.538270711898804
350 done, 27.912529230117798
355 done, 28.312389612197876
360 done, 28.67923331260681
365 done, 29.084048748016357
370 done, 29.494255542755127
375 done, 29.85740375518799
380 done, 30.232964754104614
385 done, 30.61388397216797
390 done, 30.996392488479614
395 done, 31.379972457885742
400 done, 31.761265993118286
405 done, 32.14472007751465
410 done, 32.52274680137634
415 done, 32.928714752197266
420 done, 33.302762508392334
425 done, 33.69055366516113
430 done, 34.077784061431885
435 done, 34.47099709510803
440 done, 34.842801570892334
445 done, 35.241976261138916
450 done, 35.63378548622131
455 done, 36.02464771270752
460 done, 36.43061399459839
465 done, 36.78756785392761
470 done, 37.20529890060425
475 done, 37.58163619041443
480 done, 37.98228168487549
485 done, 38.38580346107483
490 done, 38.763463497161865
495 done, 39.167964458465576
500 done, 39.57504725456238
505 done, 39.962172985076904
510 done, 40.37232208251953
515 done, 40.72806191444397
520 done, 41.1547064781189
525 done, 41.500030517578125
530 done, 41.88764786720276
535 done, 42.276387453079224
540 done, 42.66238570213318
545 done, 43.06632137298584
550 done, 43.41918349266052
555 done, 43.84536623954773
560 done, 44.23313283920288
565 done, 44.61800694465637
570 done, 44.98036813735962
575 done, 45.401899099349976
580 done, 45.763211488723755
585 done, 46.161136865615845
590 done, 46.52697801589966
595 done, 46.93542218208313
600 done, 47.320653438568115
605 done, 47.72657775878906
610 done, 48.08842325210571
615 done, 48.48499584197998
620 done, 48.886828660964966
625 done, 49.2636935710907
630 done, 49.62689685821533
635 done, 50.014978647232056
640 done, 50.40575194358826
645 done, 50.784011363983154
650 done, 51.18371343612671
655 done, 51.571752309799194
660 done, 51.93593764305115
665 done, 52.334174394607544
670 done, 52.7110710144043
675 done, 53.0884952545166
680 done, 53.50185513496399
685 done, 53.885148763656616
690 done, 54.26203751564026
695 done, 54.63493061065674
700 done, 55.023661613464355
705 done, 55.389198541641235
710 done, 55.76806855201721
715 done, 56.14607501029968
720 done, 56.551657915115356
725 done, 56.93309020996094
730 done, 57.30261588096619
735 done, 57.6970489025116
740 done, 58.11862564086914
745 done, 58.47442317008972
750 done, 58.85162162780762
755 done, 59.250535011291504
760 done, 59.627315521240234
765 done, 60.023086071014404
770 done, 60.39529895782471
775 done, 60.779624462127686
780 done, 61.16417098045349
785 done, 61.54671096801758
790 done, 61.9278347492218
795 done, 62.317710161209106
800 done, 62.712127685546875
805 done, 63.08165884017944
810 done, 63.475751876831055
815 done, 63.857280254364014
820 done, 64.22975277900696
825 done, 64.61401748657227
830 done, 65.0001471042633
835 done, 65.38176822662354
840 done, 65.77804565429688
845 done, 66.17784404754639
850 done, 66.53648948669434
855 done, 66.92903685569763
860 done, 67.31445503234863
865 done, 67.72422313690186
870 done, 68.0761365890503
875 done, 68.48452425003052
880 done, 68.87054109573364
885 done, 69.25377488136292
890 done, 69.64117527008057
895 done, 70.04185366630554
900 done, 70.4283287525177
905 done, 70.8021388053894
910 done, 71.19557476043701
915 done, 71.5963990688324
920 done, 71.977942943573
925 done, 72.37270736694336
930 done, 72.74741411209106
935 done, 73.14815425872803
940 done, 73.52707505226135
945 done, 73.90832018852234
950 done, 74.29949593544006
955 done, 74.68591737747192
960 done, 75.06726360321045
965 done, 75.45162081718445
970 done, 75.83878898620605
975 done, 76.2189781665802
980 done, 76.61057901382446
985 done, 76.9794180393219
990 done, 77.39806008338928
995 done, 77.76587128639221
1000 done, 78.14691424369812
1005 done, 78.51626181602478
1010 done, 78.9102246761322
1015 done, 79.28442621231079
1020 done, 79.65670680999756
1025 done, 80.05499053001404
1030 done, 80.42851424217224
1035 done, 80.81850171089172
1040 done, 81.19444751739502
1045 done, 81.5593330860138
1050 done, 81.95001459121704
1055 done, 82.3365957736969
1060 done, 82.72114825248718
1065 done, 83.09857678413391
1070 done, 83.4881739616394
1075 done, 83.86461305618286
1080 done, 84.24799990653992
1085 done, 84.62825465202332
1090 done, 85.01897430419922
1095 done, 85.38664197921753
1100 done, 85.78169631958008
1105 done, 86.15666961669922
1110 done, 86.5611732006073
1115 done, 86.94381070137024
1120 done, 87.34171795845032
1125 done, 87.71135997772217
1130 done, 88.10237002372742
1135 done, 88.49527525901794
1140 done, 88.88247156143188
1145 done, 89.26415753364563
1150 done, 89.64172196388245
1155 done, 90.02797794342041
1160 done, 90.4079098701477
1165 done, 90.78547263145447
1170 done, 91.17601299285889
1175 done, 91.55146670341492
1180 done, 91.93451476097107
1185 done, 92.32014036178589
1190 done, 92.69587135314941
1195 done, 93.08174061775208
1200 done, 93.46492028236389
1205 done, 93.8482985496521
1210 done, 94.23295974731445
1215 done, 94.60936951637268
1220 done, 95.0038959980011
1225 done, 95.38508462905884
1230 done, 95.76945805549622
1235 done, 96.15371298789978
1240 done, 96.52788472175598
1245 done, 96.90053844451904
1250 done, 97.27894330024719
1255 done, 97.66366076469421
1260 done, 98.04296684265137
1265 done, 98.43003749847412
1270 done, 98.81429862976074
1275 done, 99.20108771324158
1280 done, 99.57811784744263
1285 done, 99.96428442001343
1290 done, 100.33957052230835
1295 done, 100.72369456291199
1300 done, 101.09698104858398
1305 done, 101.47606587409973
1310 done, 101.85790014266968
1315 done, 102.24446177482605
1320 done, 102.63164377212524
1325 done, 103.01276230812073
1330 done, 103.40927982330322
1335 done, 103.80992794036865
1340 done, 104.20266771316528
1345 done, 104.57926177978516
1350 done, 104.98246169090271
1355 done, 105.36634707450867
1360 done, 105.74168968200684
1365 done, 106.12457513809204
1370 done, 106.51714992523193
1375 done, 106.89436912536621
1380 done, 107.27085256576538
1385 done, 107.66155433654785
1390 done, 108.028315782547
1395 done, 108.41190671920776
1400 done, 108.80394220352173
1405 done, 109.17514181137085
1410 done, 109.56265091896057
1415 done, 109.95081114768982
1420 done, 110.33561420440674
1425 done, 110.71273994445801
1430 done, 111.09171772003174
1435 done, 111.47609615325928
1440 done, 111.85429739952087
1445 done, 112.23667860031128
1450 done, 112.61955165863037
1455 done, 112.99793148040771
1460 done, 113.38851833343506
1465 done, 113.76855301856995
1470 done, 114.1660807132721
1475 done, 114.5475754737854
1480 done, 114.92905402183533
1485 done, 115.31338858604431
1490 done, 115.68027997016907
1495 done, 116.06084060668945
1500 done, 116.44997668266296
1505 done, 116.83036518096924
1510 done, 117.21098399162292
1515 done, 117.59425711631775
1520 done, 117.96703767776489
1525 done, 118.33863472938538
1530 done, 118.72356271743774
1535 done, 119.1332643032074
1540 done, 119.48778557777405
1545 done, 119.86788535118103
1550 done, 120.24672412872314
1555 done, 120.6217429637909
1560 done, 120.99695110321045
1565 done, 121.37977004051208
1570 done, 121.77221918106079
1575 done, 122.17509388923645
1580 done, 122.55719995498657
1585 done, 122.93667984008789
1590 done, 123.35222458839417
1595 done, 123.74211883544922
1600 done, 124.12389636039734
1605 done, 124.5122582912445
1610 done, 124.8997015953064
1615 done, 125.28213596343994
1620 done, 125.68218231201172
1625 done, 126.08383679389954
1630 done, 126.47371482849121
1635 done, 126.84444046020508
1640 done, 127.23132658004761
1645 done, 127.61368918418884
1650 done, 127.99730610847473
1655 done, 128.3777027130127
1660 done, 128.75566053390503
1665 done, 129.14171528816223
1670 done, 129.5226104259491
1675 done, 129.9076681137085
1680 done, 130.2842676639557
1685 done, 130.67197632789612
1690 done, 131.0692653656006
1695 done, 131.43784141540527
1700 done, 131.80851578712463
1705 done, 132.19280290603638
1710 done, 132.5805115699768
1715 done, 132.9616162776947
1720 done, 133.34007048606873
1725 done, 133.7207226753235
1730 done, 134.11619019508362
1735 done, 134.49426531791687
1740 done, 134.90068554878235
1745 done, 135.2692232131958
1750 done, 135.65832996368408
1755 done, 136.03240180015564
1760 done, 136.41043710708618
1765 done, 136.80464482307434
1770 done, 137.2087528705597
1775 done, 137.58966326713562
1780 done, 137.96461987495422
1785 done, 138.34827375411987
1790 done, 138.72814297676086
1795 done, 139.11354804039001
1800 done, 139.5034441947937
1805 done, 139.89383268356323
1810 done, 140.2939121723175
1815 done, 140.6767635345459
1820 done, 141.06733393669128
1825 done, 141.45396995544434
In [81]:
len(flattened)
Out[81]:
86567
In [82]:
tmp = pd.DataFrame(flattened,columns = ['new_id'])
tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)
Out[82]:
(0.6, 86567, 0.6127161620478936)
In [90]:
len(df.loc[df['t']<4])/len(df)
Out[90]:
0.0019937976516339003
In [92]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))

ax = sns.histplot(df['t'],stat='density')
ax.set(xlabel = '# PAS in the gene')
Out[92]:
[Text(0.5, 0, '# PAS in the gene')]
No description has been provided for this image
In [ ]:
 
In [84]:
# let's try within each tissue

# add quantiles by number of PAS in the gene
# df['PAS_num_cat'] = pd.qcut(df['t'],q = 100)
df['PAS_num_cat'] = df['t']

motif_threshold = 0.72

def get_filtered_PAS(L,data,feature,motif_threshold,iterator_i,iterator_j):

    # 1. define expression quantiles and loop within them
    expr_feature = feature+'_sum'
    scoring_feature = feature+'_ratio'
    
    data['expr_cat'] = pd.qcut(data[expr_feature],q=10,duplicates='drop')
    for expr_cat in list(data['expr_cat'].unique()):
        tmp = data.loc[data['expr_cat']==expr_cat].reset_index(drop=True)
        tmp = tmp[['new_id',scoring_feature,'any_canonic_motif']].sort_values(scoring_feature,ascending=False).reset_index(drop=True)
        tmp['t']=1
        tmp['t_cumsum'] = tmp['t'].cumsum()
        tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
        tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
        if tmp['frac_cumul'].max()>=motif_threshold:
            max_index = max(tmp.loc[tmp['frac_cumul']>=motif_threshold].index)
            pas_to_append = list(tmp.loc[0:max_index]['new_id'])
            L.append(pas_to_append)
        
    if iterator_i%5==0 and iterator_j%5==0:
        print(str(iterator_i)+','+str(iterator_j)+' done, '+str(time.time()-start_time))     
    
start_time = time.time()

with Manager() as manager:
    L = manager.list()
    processes = []
    i=0
    for PAS_num_cat in list(df['PAS_num_cat'].unique()):
        data = df.loc[df['PAS_num_cat']==PAS_num_cat].reset_index(drop=True)
        j=0
        for feature in tissues:
            p = Process(target=get_filtered_PAS, args=(L,data,feature,motif_threshold,i,j))  # Passing the list
            p.start()
            processes.append(p)
            j=j+1
        i=i+1
    for p in processes:
        p.join()
    L = list(L)
flattened = []
for elem in L:
    flattened = flattened+elem
flattened = list(set(flattened))
0,0 done, 0.24164724349975586
0,5 done, 0.5212569236755371
0,10 done, 0.81752610206604
0,15 done, 1.0853595733642578
5,0 done, 5.4705047607421875
5,5 done, 5.761173963546753
5,10 done, 6.051577568054199
5,15 done, 6.333654165267944
10,0 done, 10.630130052566528
10,5 done, 10.926160097122192
10,10 done, 11.209879159927368
10,15 done, 11.477243185043335
15,0 done, 15.771695137023926
15,5 done, 16.049487352371216
15,10 done, 16.328447580337524
15,15 done, 16.602094173431396
20,0 done, 20.91411256790161
20,5 done, 21.203969955444336
20,10 done, 21.489899158477783
20,15 done, 21.76624894142151
25,0 done, 26.057672262191772
25,5 done, 26.322872400283813
25,10 done, 26.613312482833862
25,15 done, 26.892600297927856
30,0 done, 31.18018126487732
30,5 done, 31.467642307281494
30,10 done, 31.741339206695557
30,15 done, 32.03318119049072
35,0 done, 36.32417607307434
35,5 done, 36.602927923202515
35,10 done, 36.88266682624817
35,15 done, 37.15081572532654
40,0 done, 41.440335273742676
40,5 done, 41.73688077926636
40,10 done, 42.00178623199463
40,15 done, 42.289522647857666
45,0 done, 46.640711545944214
45,5 done, 46.92738127708435
45,10 done, 47.2142596244812
45,15 done, 47.48786950111389
50,0 done, 51.81929898262024
50,5 done, 52.08564281463623
50,10 done, 52.373751401901245
50,15 done, 52.652676820755005
55,0 done, 56.91909384727478
55,5 done, 57.198766231536865
55,10 done, 57.471991300582886
55,15 done, 57.749422788619995
60,0 done, 62.03436732292175
60,5 done, 62.318875789642334
60,10 done, 62.57627630233765
60,15 done, 62.85427713394165
65,0 done, 67.16364550590515
65,5 done, 67.45202422142029
65,10 done, 67.71409487724304
65,15 done, 67.99264407157898
70,0 done, 72.27025294303894
70,5 done, 72.56512904167175
70,10 done, 72.84230852127075
70,15 done, 73.12736582756042
75,0 done, 77.39294958114624
75,5 done, 77.67104125022888
75,10 done, 77.94792890548706
75,15 done, 78.22971296310425
80,0 done, 82.5317018032074
80,5 done, 82.80428385734558
80,10 done, 83.08894920349121
80,15 done, 83.39330959320068
85,0 done, 87.64873123168945
85,5 done, 87.93561434745789
85,10 done, 88.21007251739502
85,15 done, 88.48969030380249
90,0 done, 92.78563284873962
90,5 done, 93.08458256721497
90,10 done, 93.33713006973267
90,15 done, 93.61748552322388
95,0 done, 98.00365352630615
95,5 done, 98.26403546333313
95,10 done, 98.54574513435364
95,15 done, 98.84749341011047
100,0 done, 103.11708068847656
100,5 done, 103.39178586006165
100,10 done, 103.67576503753662
100,15 done, 103.96727585792542
105,0 done, 108.26972770690918
105,5 done, 108.56708693504333
105,10 done, 108.83837223052979
105,15 done, 109.10081624984741
110,0 done, 113.41192603111267
110,5 done, 113.70247960090637
110,10 done, 113.98627209663391
110,15 done, 114.27441096305847
115,0 done, 118.64116287231445
115,5 done, 118.93541550636292
115,10 done, 119.1908106803894
115,15 done, 119.45669746398926
120,0 done, 123.63670516014099
120,5 done, 123.8972225189209
120,10 done, 124.18350720405579
120,15 done, 124.43578362464905
125,0 done, 128.60830187797546
125,5 done, 128.87898921966553
125,10 done, 129.1560435295105
125,15 done, 129.40236401557922
130,0 done, 133.5451786518097
130,5 done, 133.82622599601746
130,10 done, 134.08655524253845
130,15 done, 134.36100339889526
135,0 done, 138.4590892791748
135,5 done, 138.75429821014404
135,10 done, 139.02001309394836
135,15 done, 139.29079627990723
140,0 done, 143.39318323135376
140,5 done, 143.69177794456482
140,10 done, 143.96799731254578
140,15 done, 144.23493552207947
145,0 done, 148.43969774246216
145,5 done, 148.71066999435425
145,10 done, 148.9861717224121
145,15 done, 149.24526119232178
150,0 done, 153.36662650108337
150,5 done, 153.64371848106384
150,10 done, 153.91676712036133
150,15 done, 154.18864917755127
155,0 done, 158.43565392494202
155,5 done, 158.70899367332458
155,10 done, 158.97197914123535
155,15 done, 159.2377965450287
160,0 done, 163.3858962059021
160,5 done, 163.65737318992615
160,10 done, 163.93180561065674
160,15 done, 164.21466994285583
165,0 done, 168.39101266860962
165,5 done, 168.6672146320343
165,10 done, 168.9538815021515
165,15 done, 169.22979140281677
170,0 done, 173.37977027893066
170,5 done, 173.6368486881256
170,10 done, 173.92251873016357
170,15 done, 174.19756603240967
175,0 done, 178.31804513931274
175,5 done, 178.60632276535034
175,10 done, 178.87167501449585
175,15 done, 179.1398069858551
180,0 done, 183.30749940872192
180,5 done, 183.58507657051086
180,10 done, 183.8646092414856
180,15 done, 184.13516879081726
185,0 done, 188.29443383216858
185,5 done, 188.56792187690735
185,10 done, 188.84500741958618
185,15 done, 189.12130451202393
190,0 done, 193.30440855026245
190,5 done, 193.59757494926453
190,10 done, 193.8599410057068
190,15 done, 194.11533164978027
195,0 done, 198.30212664604187
195,5 done, 198.55965900421143
195,10 done, 198.82527256011963
195,15 done, 199.0986783504486
200,0 done, 203.30581951141357
200,5 done, 203.5724618434906
200,10 done, 203.85144686698914
200,15 done, 204.1353521347046
205,0 done, 208.303231716156
205,5 done, 208.55742740631104
205,10 done, 208.84722137451172
205,15 done, 209.10077214241028
210,0 done, 213.27168798446655
210,5 done, 213.55215764045715
210,10 done, 213.8308389186859
210,15 done, 214.09841299057007
215,0 done, 218.29819059371948
215,5 done, 218.55914211273193
215,10 done, 218.83475255966187
215,15 done, 219.1120729446411
220,0 done, 223.3249795436859
220,5 done, 223.6068286895752
220,10 done, 223.87664270401
220,15 done, 224.1399097442627
225,0 done, 228.36349821090698
225,5 done, 228.63332986831665
225,10 done, 228.89439010620117
225,15 done, 229.25585460662842
230,0 done, 233.55262327194214
230,5 done, 233.8238444328308
230,10 done, 234.1181252002716
230,15 done, 234.37919402122498
235,0 done, 238.5248007774353
235,5 done, 238.79728317260742
235,10 done, 239.06859064102173
235,15 done, 239.35625910758972
240,0 done, 243.51157665252686
240,5 done, 243.7708761692047
240,10 done, 244.04712796211243
240,15 done, 244.32174253463745
245,0 done, 248.5042643547058
245,5 done, 248.76883506774902
245,10 done, 249.03928089141846
245,15 done, 249.32917380332947
250,0 done, 253.5294153690338
250,5 done, 253.80415177345276
250,10 done, 254.06284022331238
250,15 done, 254.35352611541748
255,0 done, 258.5487151145935
255,5 done, 258.82013034820557
255,10 done, 259.09327578544617
255,15 done, 259.3798871040344
260,0 done, 263.5269412994385
260,5 done, 263.797399520874
260,10 done, 264.0839650630951
260,15 done, 264.3481044769287
265,0 done, 268.5343379974365
265,5 done, 268.8143243789673
265,10 done, 269.0817859172821
265,15 done, 269.3701205253601
270,0 done, 273.5129625797272
270,5 done, 273.7865467071533
270,10 done, 274.06920075416565
270,15 done, 274.35462403297424
275,0 done, 278.50887989997864
275,5 done, 278.79722023010254
275,10 done, 279.07101917266846
275,15 done, 279.3437204360962
280,0 done, 283.5157153606415
280,5 done, 283.80007004737854
280,10 done, 284.0703332424164
280,15 done, 284.34508299827576
285,0 done, 288.5400755405426
285,5 done, 288.82456374168396
285,10 done, 289.09572172164917
285,15 done, 289.3652627468109
290,0 done, 293.4575197696686
290,5 done, 293.73136162757874
290,10 done, 294.00155901908875
290,15 done, 294.2786808013916
295,0 done, 298.4423358440399
295,5 done, 298.7234516143799
295,10 done, 298.98983240127563
295,15 done, 299.2665162086487
300,0 done, 303.42761850357056
300,5 done, 303.6991219520569
300,10 done, 303.9754412174225
300,15 done, 304.2584960460663
305,0 done, 308.42584013938904
305,5 done, 308.70153164863586
305,10 done, 308.976345539093
305,15 done, 309.25114154815674
310,0 done, 313.4364240169525
310,5 done, 313.7071797847748
310,10 done, 313.9830791950226
310,15 done, 314.27019119262695
315,0 done, 318.4357695579529
315,5 done, 318.71538734436035
315,10 done, 318.9992172718048
315,15 done, 319.2796595096588
320,0 done, 323.44446897506714
320,5 done, 323.7331793308258
320,10 done, 323.9742908477783
320,15 done, 324.2472264766693
325,0 done, 328.39719438552856
325,5 done, 328.6690971851349
325,10 done, 328.94412422180176
325,15 done, 329.2228088378906
330,0 done, 333.3870105743408
330,5 done, 333.6663155555725
330,10 done, 333.93312907218933
330,15 done, 334.20839262008667
335,0 done, 338.39019799232483
335,5 done, 338.6809298992157
335,10 done, 338.9510474205017
335,15 done, 339.2087388038635
340,0 done, 343.4102487564087
340,5 done, 343.68486523628235
340,10 done, 343.965167760849
340,15 done, 344.2312169075012
345,0 done, 348.472149848938
345,5 done, 348.7231192588806
345,10 done, 349.012327671051
345,15 done, 349.28609442710876
350,0 done, 353.46914744377136
350,5 done, 353.74036693573
350,10 done, 354.01755380630493
350,15 done, 354.2918860912323
355,0 done, 358.5105490684509
355,5 done, 358.76985931396484
355,10 done, 359.04974389076233
355,15 done, 359.3324444293976
360,0 done, 363.5048487186432
360,5 done, 363.7737581729889
360,10 done, 364.0433859825134
360,15 done, 364.3175559043884
365,0 done, 368.5716965198517
365,5 done, 368.843279838562
365,10 done, 369.1293594837189
365,15 done, 369.4007685184479
370,0 done, 373.6319532394409
370,5 done, 373.8940975666046
370,10 done, 374.17779183387756
370,15 done, 374.44702672958374
375,0 done, 378.6593904495239
375,5 done, 378.9323399066925
375,10 done, 379.2021381855011
375,15 done, 379.48533844947815
380,0 done, 383.6702415943146
380,5 done, 383.94545817375183
380,10 done, 384.22287130355835
380,15 done, 384.4988696575165
385,0 done, 388.69513869285583
385,5 done, 388.96689343452454
385,10 done, 389.2469696998596
385,15 done, 389.51947593688965
390,0 done, 393.7057538032532
390,5 done, 393.96474170684814
390,10 done, 394.2424404621124
390,15 done, 394.53152203559875
395,0 done, 398.75367045402527
395,5 done, 399.0276517868042
395,10 done, 399.27933835983276
395,15 done, 399.5656735897064
400,0 done, 403.77329754829407
400,5 done, 404.04234313964844
400,10 done, 404.3308811187744
400,15 done, 404.601763010025
405,0 done, 408.78511667251587
405,5 done, 409.06988739967346
405,10 done, 409.3475093841553
405,15 done, 409.6308686733246
410,0 done, 413.8238501548767
410,5 done, 414.10115122795105
410,10 done, 414.3725814819336
410,15 done, 414.6550180912018
415,0 done, 418.82885575294495
415,5 done, 419.08973932266235
415,10 done, 419.35843229293823
415,15 done, 419.6428337097168
420,0 done, 423.8292908668518
420,5 done, 424.1023871898651
420,10 done, 424.382611989975
420,15 done, 424.66775131225586
425,0 done, 428.86771273612976
425,5 done, 429.1471335887909
425,10 done, 429.422523021698
425,15 done, 429.7044162750244
430,0 done, 433.91064167022705
430,5 done, 434.1735727787018
430,10 done, 434.46255826950073
430,15 done, 434.739470243454
435,0 done, 438.8892867565155
435,5 done, 439.16989159584045
435,10 done, 439.4443418979645
435,15 done, 439.7211456298828
440,0 done, 443.89678859710693
440,5 done, 444.17357993125916
440,10 done, 444.4485948085785
440,15 done, 444.72360610961914
445,0 done, 448.9459762573242
445,5 done, 449.22178292274475
445,10 done, 449.5010802745819
445,15 done, 449.7735559940338
450,0 done, 454.02100229263306
450,5 done, 454.2927129268646
450,10 done, 454.5714707374573
450,15 done, 454.8255846500397
455,0 done, 459.09684801101685
455,5 done, 459.3705313205719
455,10 done, 459.6519458293915
455,15 done, 459.92254996299744
460,0 done, 464.0831913948059
460,5 done, 464.3493025302887
460,10 done, 464.64526414871216
460,15 done, 464.90820121765137
465,0 done, 469.03287625312805
465,5 done, 469.3082573413849
465,10 done, 469.58028745651245
465,15 done, 469.8574597835541
470,0 done, 474.10878252983093
470,5 done, 474.3696496486664
470,10 done, 474.63843274116516
470,15 done, 474.9300651550293
475,0 done, 479.0802249908447
475,5 done, 479.3631842136383
475,10 done, 479.628892660141
475,15 done, 479.91116189956665
480,0 done, 484.05895352363586
480,5 done, 484.33325934410095
480,10 done, 484.6120719909668
480,15 done, 484.8843674659729
485,0 done, 489.08441138267517
485,5 done, 489.36877155303955
485,10 done, 489.64523363113403
485,15 done, 489.9195625782013
490,0 done, 494.0961422920227
490,5 done, 494.3709843158722
490,10 done, 494.65176463127136
490,15 done, 494.91996145248413
495,0 done, 499.2552535533905
495,5 done, 499.54716444015503
495,10 done, 499.8217990398407
495,15 done, 500.10638642311096
500,0 done, 504.4313666820526
500,5 done, 504.72011852264404
500,10 done, 504.9935176372528
500,15 done, 505.280668258667
505,0 done, 509.5926010608673
505,5 done, 509.8793866634369
505,10 done, 510.13716340065
505,15 done, 510.43706250190735
510,0 done, 514.7864050865173
510,5 done, 515.0718212127686
510,10 done, 515.354480266571
510,15 done, 515.6221823692322
515,0 done, 519.8753478527069
515,5 done, 520.1561980247498
515,10 done, 520.4393167495728
515,15 done, 520.7242593765259
520,0 done, 525.0359773635864
520,5 done, 525.312933921814
520,10 done, 525.5941216945648
520,15 done, 525.8661017417908
525,0 done, 530.218088388443
525,5 done, 530.5031416416168
525,10 done, 530.7850489616394
525,15 done, 531.0679726600647
530,0 done, 535.4243264198303
530,5 done, 535.7200374603271
530,10 done, 535.99968957901
530,15 done, 536.2847936153412
535,0 done, 540.6449003219604
535,5 done, 540.9155657291412
535,10 done, 541.209760427475
535,15 done, 541.4948670864105
540,0 done, 545.8411958217621
540,5 done, 546.121990442276
540,10 done, 546.4087655544281
540,15 done, 546.6889855861664
545,0 done, 551.0360252857208
545,5 done, 551.3018114566803
545,10 done, 551.5975475311279
545,15 done, 551.8622314929962
550,0 done, 556.1928405761719
550,5 done, 556.4822909832001
550,10 done, 556.7638664245605
550,15 done, 557.0472047328949
555,0 done, 561.4294264316559
555,5 done, 561.7187008857727
555,10 done, 561.9917771816254
555,15 done, 562.2758059501648
560,0 done, 566.6590075492859
560,5 done, 566.9476583003998
560,10 done, 567.2245280742645
560,15 done, 567.5043573379517
565,0 done, 571.7852427959442
565,5 done, 572.0702300071716
565,10 done, 572.3465735912323
565,15 done, 572.6115500926971
570,0 done, 576.8374016284943
570,5 done, 577.1153447628021
570,10 done, 577.3969025611877
570,15 done, 577.6735191345215
575,0 done, 581.9362254142761
575,5 done, 582.2178158760071
575,10 done, 582.4650197029114
575,15 done, 582.7390463352203
580,0 done, 586.952977180481
580,5 done, 587.2221784591675
580,10 done, 587.5044434070587
580,15 done, 587.7847306728363
585,0 done, 591.9671263694763
585,5 done, 592.2525153160095
585,10 done, 592.5224878787994
585,15 done, 592.8013548851013
590,0 done, 596.9636476039886
590,5 done, 597.2385721206665
590,10 done, 597.5195469856262
590,15 done, 597.7908568382263
595,0 done, 602.0456335544586
595,5 done, 602.3154339790344
595,10 done, 602.5944061279297
595,15 done, 602.8628342151642
600,0 done, 607.0339159965515
600,5 done, 607.2962026596069
600,10 done, 607.5683290958405
600,15 done, 607.8636562824249
605,0 done, 612.0601444244385
605,5 done, 612.3437383174896
605,10 done, 612.6189403533936
605,15 done, 612.8660061359406
610,0 done, 617.0690739154816
610,5 done, 617.3497817516327
610,10 done, 617.6225843429565
610,15 done, 617.9030134677887
615,0 done, 622.0675826072693
615,5 done, 622.3402104377747
615,10 done, 622.6234769821167
615,15 done, 622.9033229351044
620,0 done, 627.129613161087
620,5 done, 627.4062819480896
620,10 done, 627.6558187007904
620,15 done, 627.9268844127655
625,0 done, 632.124080657959
625,5 done, 632.4104740619659
625,10 done, 632.6695499420166
625,15 done, 632.9564731121063
630,0 done, 637.1549327373505
630,5 done, 637.4316415786743
630,10 done, 637.7098526954651
630,15 done, 637.9892203807831
635,0 done, 642.2241199016571
635,5 done, 642.5014252662659
635,10 done, 642.7796883583069
635,15 done, 643.0554611682892
640,0 done, 647.3258881568909
640,5 done, 647.6019639968872
640,10 done, 647.8796949386597
640,15 done, 648.1600773334503
645,0 done, 652.3728315830231
645,5 done, 652.6490709781647
645,10 done, 652.9247798919678
645,15 done, 653.2050352096558
650,0 done, 657.4679977893829
650,5 done, 657.7156465053558
650,10 done, 658.0092985630035
650,15 done, 658.2637267112732
655,0 done, 662.5754718780518
655,5 done, 662.8478174209595
655,10 done, 663.1296441555023
655,15 done, 663.4150671958923
660,0 done, 667.7552168369293
660,5 done, 668.0531363487244
660,10 done, 668.3349828720093
660,15 done, 668.6215255260468
665,0 done, 672.988890171051
665,5 done, 673.2644264698029
665,10 done, 673.5445265769958
665,15 done, 673.8222689628601
670,0 done, 678.2126438617706
670,5 done, 678.4963080883026
670,10 done, 678.7728431224823
670,15 done, 679.0586297512054
675,0 done, 683.3918299674988
675,5 done, 683.6764545440674
675,10 done, 683.963515996933
675,15 done, 684.2404139041901
680,0 done, 688.5895857810974
680,5 done, 688.8738136291504
680,10 done, 689.1588642597198
680,15 done, 689.4532177448273
685,0 done, 693.7912862300873
685,5 done, 694.0671660900116
685,10 done, 694.3562915325165
685,15 done, 694.6468744277954
690,0 done, 698.9622225761414
690,5 done, 699.2441654205322
690,10 done, 699.5309822559357
690,15 done, 699.8142523765564
695,0 done, 704.176023721695
695,5 done, 704.4582962989807
695,10 done, 704.7455811500549
695,15 done, 705.0278754234314
700,0 done, 709.4494717121124
700,5 done, 709.735095500946
700,10 done, 710.0313882827759
700,15 done, 710.3094072341919
705,0 done, 714.6528625488281
705,5 done, 714.9334189891815
705,10 done, 715.2172918319702
705,15 done, 715.5013883113861
710,0 done, 719.897864818573
710,5 done, 720.1748478412628
710,10 done, 720.45676445961
710,15 done, 720.7433660030365
715,0 done, 725.0745985507965
715,5 done, 725.3476865291595
715,10 done, 725.6350479125977
715,15 done, 725.9191451072693
720,0 done, 730.1243464946747
720,5 done, 730.4036183357239
720,10 done, 730.6682891845703
720,15 done, 730.9664950370789
725,0 done, 735.1906020641327
725,5 done, 735.4627411365509
725,10 done, 735.7323019504547
725,15 done, 735.9839425086975
730,0 done, 740.2310519218445
730,5 done, 740.5022397041321
730,10 done, 740.7721447944641
730,15 done, 741.0643968582153
735,0 done, 745.3197801113129
735,5 done, 745.5991966724396
735,10 done, 745.8790557384491
735,15 done, 746.1433944702148
740,0 done, 750.393824338913
740,5 done, 750.6789584159851
740,10 done, 750.9453184604645
740,15 done, 751.1969521045685
745,0 done, 755.4414467811584
745,5 done, 755.7089140415192
745,10 done, 756.0033898353577
745,15 done, 756.2723898887634
750,0 done, 760.5033583641052
750,5 done, 760.7806828022003
750,10 done, 761.0586125850677
750,15 done, 761.3395943641663
755,0 done, 765.5693111419678
755,5 done, 765.8607499599457
755,10 done, 766.1120779514313
755,15 done, 766.412159204483
760,0 done, 770.5829894542694
760,5 done, 770.8523952960968
760,10 done, 771.1361811161041
760,15 done, 771.4068326950073
765,0 done, 775.6807835102081
765,5 done, 775.9448416233063
765,10 done, 776.2266187667847
765,15 done, 776.4898588657379
770,0 done, 780.7135100364685
770,5 done, 780.9960253238678
770,10 done, 781.2676248550415
770,15 done, 781.540034532547
775,0 done, 785.7953722476959
775,5 done, 786.0735812187195
775,10 done, 786.3476424217224
775,15 done, 786.6272139549255
780,0 done, 790.893114566803
780,5 done, 791.1702108383179
780,10 done, 791.4534525871277
780,15 done, 791.7239575386047
785,0 done, 795.9631559848785
785,5 done, 796.2398178577423
785,10 done, 796.5130569934845
785,15 done, 796.7889997959137
790,0 done, 801.0111815929413
790,5 done, 801.2948307991028
790,10 done, 801.581146478653
790,15 done, 801.8617186546326
795,0 done, 806.1055793762207
795,5 done, 806.3847088813782
795,10 done, 806.6642324924469
795,15 done, 806.9478843212128
800,0 done, 811.2108924388885
800,5 done, 811.467043876648
800,10 done, 811.7577788829803
800,15 done, 812.01353764534
805,0 done, 816.2692368030548
805,5 done, 816.5391526222229
805,10 done, 816.8168656826019
805,15 done, 817.094530582428
810,0 done, 821.4771661758423
810,5 done, 821.751677274704
810,10 done, 822.0233829021454
810,15 done, 822.2994844913483
815,0 done, 826.4803774356842
815,5 done, 826.7601511478424
815,10 done, 827.042578458786
815,15 done, 827.3095343112946
820,0 done, 831.6213455200195
820,5 done, 831.9081859588623
820,10 done, 832.199898481369
820,15 done, 832.4849102497101
825,0 done, 836.8639736175537
825,5 done, 837.1501755714417
825,10 done, 837.4386613368988
825,15 done, 837.7188942432404
830,0 done, 842.2067861557007
830,5 done, 842.4897561073303
830,10 done, 842.776419878006
830,15 done, 843.0630719661713
835,0 done, 847.3844666481018
835,5 done, 847.6636922359467
835,10 done, 847.9421803951263
835,15 done, 848.2229390144348
840,0 done, 852.6050081253052
840,5 done, 852.8891174793243
840,10 done, 853.17134308815
840,15 done, 853.4583516120911
845,0 done, 857.8573467731476
845,5 done, 858.1489543914795
845,10 done, 858.4046568870544
845,15 done, 858.6872222423553
850,0 done, 862.9923207759857
850,5 done, 863.2741010189056
850,10 done, 863.5625824928284
850,15 done, 863.8496580123901
855,0 done, 868.2418196201324
855,5 done, 868.5303859710693
855,10 done, 868.8160524368286
855,15 done, 869.102077960968
860,0 done, 873.4994742870331
860,5 done, 873.7883849143982
860,10 done, 874.0711283683777
860,15 done, 874.3665254116058
865,0 done, 878.7859773635864
865,5 done, 879.0804629325867
865,10 done, 879.3377728462219
865,15 done, 879.6212348937988
870,0 done, 884.0143649578094
870,5 done, 884.2982897758484
870,10 done, 884.5915343761444
870,15 done, 884.8753366470337
875,0 done, 889.2721331119537
875,5 done, 889.5469808578491
875,10 done, 889.83891248703
875,15 done, 890.1276779174805
880,0 done, 894.5073704719543
880,5 done, 894.7919833660126
880,10 done, 895.0795361995697
880,15 done, 895.3605134487152
885,0 done, 899.6969661712646
885,5 done, 899.9802167415619
885,10 done, 900.2674098014832
885,15 done, 900.5604457855225
890,0 done, 904.9466788768768
890,5 done, 905.2319974899292
890,10 done, 905.5117061138153
890,15 done, 905.7945983409882
895,0 done, 910.1624882221222
895,5 done, 910.4474611282349
895,10 done, 910.7289435863495
895,15 done, 911.0159175395966
900,0 done, 915.3851385116577
900,5 done, 915.6591608524323
900,10 done, 915.9490976333618
900,15 done, 916.2410159111023
905,0 done, 920.6494598388672
905,5 done, 920.9370155334473
905,10 done, 921.2071959972382
905,15 done, 921.486394405365
910,0 done, 925.9617817401886
910,5 done, 926.2441463470459
910,10 done, 926.5272858142853
910,15 done, 926.8141684532166
915,0 done, 931.2210638523102
915,5 done, 931.5012364387512
915,10 done, 931.7902836799622
915,15 done, 932.0779583454132
920,0 done, 936.4148895740509
920,5 done, 936.7008972167969
920,10 done, 936.9833786487579
920,15 done, 937.2718968391418
925,0 done, 941.666154384613
925,5 done, 941.9432063102722
925,10 done, 942.2414374351501
925,15 done, 942.5181872844696
930,0 done, 946.9199805259705
930,5 done, 947.2022395133972
930,10 done, 947.4846925735474
930,15 done, 947.7730929851532
935,0 done, 952.1855492591858
935,5 done, 952.4641237258911
935,10 done, 952.7553219795227
935,15 done, 953.0346581935883
940,0 done, 957.2769856452942
940,5 done, 957.5664367675781
940,10 done, 957.8540024757385
940,15 done, 958.1457755565643
945,0 done, 962.5507564544678
945,5 done, 962.8432266712189
945,10 done, 963.1270995140076
945,15 done, 963.4140055179596
950,0 done, 967.8107125759125
950,5 done, 968.1003966331482
950,10 done, 968.3898718357086
950,15 done, 968.6730215549469
955,0 done, 973.0658066272736
955,5 done, 973.3453478813171
955,10 done, 973.6333844661713
955,15 done, 973.916127204895
960,0 done, 978.2917811870575
960,5 done, 978.5568854808807
960,10 done, 978.8455209732056
960,15 done, 979.1360175609589
965,0 done, 983.5020980834961
965,5 done, 983.7733964920044
965,10 done, 984.0641558170319
965,15 done, 984.3492908477783
970,0 done, 988.7335839271545
970,5 done, 989.0228207111359
970,10 done, 989.3108632564545
970,15 done, 989.5975909233093
975,0 done, 993.9689362049103
975,5 done, 994.25949883461
975,10 done, 994.5435929298401
975,15 done, 994.8284201622009
980,0 done, 999.2368927001953
980,5 done, 999.52472448349
980,10 done, 999.8201823234558
980,15 done, 1000.1041505336761
985,0 done, 1004.4944958686829
985,5 done, 1004.7743382453918
985,10 done, 1005.0640025138855
985,15 done, 1005.3557107448578
990,0 done, 1009.8037850856781
990,5 done, 1010.087170124054
990,10 done, 1010.3725650310516
990,15 done, 1010.6510910987854
995,0 done, 1015.0388147830963
995,5 done, 1015.3268163204193
995,10 done, 1015.6026592254639
995,15 done, 1015.8919188976288
1000,0 done, 1020.292563199997
1000,5 done, 1020.5696680545807
1000,10 done, 1020.8624000549316
1000,15 done, 1021.1517825126648
1005,0 done, 1025.5269122123718
1005,5 done, 1025.8058607578278
1005,10 done, 1026.0920014381409
1005,15 done, 1026.380128145218
1010,0 done, 1030.7932331562042
1010,5 done, 1031.076064825058
1010,10 done, 1031.3664486408234
1010,15 done, 1031.6582448482513
1015,0 done, 1036.016799211502
1015,5 done, 1036.2971568107605
1015,10 done, 1036.5800948143005
1015,15 done, 1036.8655347824097
1020,0 done, 1041.2482006549835
1020,5 done, 1041.5398230552673
1020,10 done, 1041.8276278972626
1020,15 done, 1042.1181259155273
1025,0 done, 1046.4189794063568
1025,5 done, 1046.6972961425781
1025,10 done, 1046.9841032028198
1025,15 done, 1047.2734334468842
1030,0 done, 1051.6837584972382
1030,5 done, 1051.9602708816528
1030,10 done, 1052.2345156669617
1030,15 done, 1052.5315897464752
1035,0 done, 1056.8803253173828
1035,5 done, 1057.1647400856018
1035,10 done, 1057.4587049484253
1035,15 done, 1057.7395386695862
1040,0 done, 1062.1494750976562
1040,5 done, 1062.4301352500916
1040,10 done, 1062.721010684967
1040,15 done, 1063.0033733844757
1045,0 done, 1067.403354883194
1045,5 done, 1067.684360742569
1045,10 done, 1067.964678287506
1045,15 done, 1068.2606790065765
1050,0 done, 1072.6394410133362
1050,5 done, 1072.9182677268982
1050,10 done, 1073.2000091075897
1050,15 done, 1073.49178481102
1055,0 done, 1077.8869438171387
1055,5 done, 1078.1658914089203
1055,10 done, 1078.458373785019
1055,15 done, 1078.748821735382
1060,0 done, 1083.129519701004
1060,5 done, 1083.4134600162506
1060,10 done, 1083.704926252365
1060,15 done, 1083.9911887645721
1065,0 done, 1088.3839826583862
1065,5 done, 1088.6764409542084
1065,10 done, 1088.9610440731049
1065,15 done, 1089.2446382045746
1070,0 done, 1093.6419093608856
1070,5 done, 1093.9129869937897
1070,10 done, 1094.1967902183533
1070,15 done, 1094.4655084609985
1075,0 done, 1098.770290851593
1075,5 done, 1099.0566771030426
1075,10 done, 1099.3425967693329
1075,15 done, 1099.6270215511322
1080,0 done, 1104.0077860355377
1080,5 done, 1104.2915165424347
1080,10 done, 1104.5860631465912
1080,15 done, 1104.8722350597382
1085,0 done, 1109.236074924469
1085,5 done, 1109.5148572921753
1085,10 done, 1109.7991902828217
1085,15 done, 1110.085428237915
1090,0 done, 1114.4971516132355
1090,5 done, 1114.7775840759277
1090,10 done, 1115.0680482387543
1090,15 done, 1115.3564898967743
1095,0 done, 1119.733363866806
1095,5 done, 1120.0189459323883
1095,10 done, 1120.3063745498657
1095,15 done, 1120.5882532596588
1100,0 done, 1124.9957702159882
1100,5 done, 1125.2812929153442
1100,10 done, 1125.5768175125122
1100,15 done, 1125.858867406845
1105,0 done, 1130.268595457077
1105,5 done, 1130.5482759475708
1105,10 done, 1130.823340177536
1105,15 done, 1131.0985162258148
1110,0 done, 1135.3941068649292
1110,5 done, 1135.6760725975037
1110,10 done, 1135.9476432800293
1110,15 done, 1136.2346460819244
1115,0 done, 1140.5023293495178
1115,5 done, 1140.7894322872162
1115,10 done, 1141.0632362365723
1115,15 done, 1141.3477919101715
1120,0 done, 1145.5996506214142
1120,5 done, 1145.8808016777039
1120,10 done, 1146.162737607956
1120,15 done, 1146.4395127296448
1125,0 done, 1150.7161781787872
1125,5 done, 1150.9943566322327
1125,10 done, 1151.2688057422638
1125,15 done, 1151.5495381355286
1130,0 done, 1155.8192601203918
1130,5 done, 1156.0978605747223
1130,10 done, 1156.373423576355
1130,15 done, 1156.6533255577087
1135,0 done, 1160.9056115150452
1135,5 done, 1161.187650680542
1135,10 done, 1161.4656240940094
1135,15 done, 1161.7416336536407
1140,0 done, 1166.0166244506836
1140,5 done, 1166.292489528656
1140,10 done, 1166.570408821106
1140,15 done, 1166.851573228836
1145,0 done, 1171.0578076839447
1145,5 done, 1171.331726551056
1145,10 done, 1171.612963438034
1145,15 done, 1171.8917789459229
1150,0 done, 1176.1561117172241
1150,5 done, 1176.4364132881165
1150,10 done, 1176.7107977867126
1150,15 done, 1176.987321138382
1155,0 done, 1181.2489268779755
1155,5 done, 1181.520659685135
1155,10 done, 1181.803949356079
1155,15 done, 1182.0831470489502
1160,0 done, 1186.347627878189
1160,5 done, 1186.6355175971985
1160,10 done, 1186.9172685146332
1160,15 done, 1187.1981542110443
1165,0 done, 1191.4484939575195
1165,5 done, 1191.724142074585
1165,10 done, 1191.9999377727509
1165,15 done, 1192.2816755771637
1170,0 done, 1196.5937259197235
1170,5 done, 1196.867217540741
1170,10 done, 1197.1487972736359
1170,15 done, 1197.4347579479218
1175,0 done, 1201.6635837554932
1175,5 done, 1201.945675611496
1175,10 done, 1202.224086523056
1175,15 done, 1202.501743555069
1180,0 done, 1206.7694919109344
1180,5 done, 1207.0450048446655
1180,10 done, 1207.325566291809
1180,15 done, 1207.5961349010468
1185,0 done, 1211.8595299720764
1185,5 done, 1212.1404039859772
1185,10 done, 1212.422812461853
1185,15 done, 1212.6959252357483
1190,0 done, 1216.9548873901367
1190,5 done, 1217.2315192222595
1190,10 done, 1217.5136504173279
1190,15 done, 1217.7897448539734
1195,0 done, 1222.0654270648956
1195,5 done, 1222.3316173553467
1195,10 done, 1222.6171822547913
1195,15 done, 1222.8862719535828
1200,0 done, 1227.198169708252
1200,5 done, 1227.472333908081
1200,10 done, 1227.7556042671204
1200,15 done, 1228.038906097412
1205,0 done, 1232.2589263916016
1205,5 done, 1232.5338411331177
1205,10 done, 1232.803099155426
1205,15 done, 1233.0916182994843
1210,0 done, 1237.3449256420135
1210,5 done, 1237.6192302703857
1210,10 done, 1237.897938966751
1210,15 done, 1238.1805198192596
1215,0 done, 1242.404221534729
1215,5 done, 1242.6863224506378
1215,10 done, 1242.9630308151245
1215,15 done, 1243.2446374893188
1220,0 done, 1247.5561499595642
1220,5 done, 1247.8288509845734
1220,10 done, 1248.115648984909
1220,15 done, 1248.3973772525787
1225,0 done, 1252.672382593155
1225,5 done, 1252.9438087940216
1225,10 done, 1253.2243266105652
1225,15 done, 1253.5034592151642
1230,0 done, 1257.8050847053528
1230,5 done, 1258.0927331447601
1230,10 done, 1258.3730342388153
1230,15 done, 1258.652156829834
1235,0 done, 1262.9399600028992
1235,5 done, 1263.210993051529
1235,10 done, 1263.4909036159515
1235,15 done, 1263.7753131389618
1240,0 done, 1268.0354998111725
1240,5 done, 1268.3196654319763
1240,10 done, 1268.5911135673523
1240,15 done, 1268.8780944347382
1245,0 done, 1273.143345117569
1245,5 done, 1273.4263741970062
1245,10 done, 1273.7052874565125
1245,15 done, 1273.977172613144
1250,0 done, 1278.2499623298645
1250,5 done, 1278.5354166030884
1250,10 done, 1278.8043982982635
1250,15 done, 1279.092277765274
1255,0 done, 1283.5106468200684
1255,5 done, 1283.781147480011
1255,10 done, 1284.0767843723297
1255,15 done, 1284.3738152980804
1260,0 done, 1288.7941825389862
1260,5 done, 1289.0830874443054
1260,10 done, 1289.3742253780365
1260,15 done, 1289.6644303798676
1265,0 done, 1294.0469517707825
1265,5 done, 1294.332862854004
1265,10 done, 1294.6230688095093
1265,15 done, 1294.903118610382
1270,0 done, 1299.3652052879333
1270,5 done, 1299.66392993927
1270,10 done, 1299.950447320938
1270,15 done, 1300.246829509735
1275,0 done, 1304.6893606185913
1275,5 done, 1304.9760165214539
1275,10 done, 1305.2732141017914
1275,15 done, 1305.56423163414
1280,0 done, 1309.9902613162994
1280,5 done, 1310.2789947986603
1280,10 done, 1310.5654938220978
1280,15 done, 1310.8554456233978
1285,0 done, 1315.3180947303772
1285,5 done, 1315.602243900299
1285,10 done, 1315.8937718868256
1285,15 done, 1316.190182209015
1290,0 done, 1320.6255223751068
1290,5 done, 1320.9153006076813
1290,10 done, 1321.1891181468964
1290,15 done, 1321.4846577644348
1295,0 done, 1325.9444034099579
1295,5 done, 1326.221796989441
1295,10 done, 1326.5213205814362
1295,15 done, 1326.8059651851654
1300,0 done, 1331.289895772934
1300,5 done, 1331.5779581069946
1300,10 done, 1331.8661901950836
1300,15 done, 1332.1664481163025
1305,0 done, 1336.6263513565063
1305,5 done, 1336.9078183174133
1305,10 done, 1337.1978058815002
1305,15 done, 1337.4912497997284
1310,0 done, 1341.9573850631714
1310,5 done, 1342.240403175354
1310,10 done, 1342.5242319107056
1310,15 done, 1342.8203570842743
1315,0 done, 1347.2777073383331
1315,5 done, 1347.5598759651184
1315,10 done, 1347.8512926101685
1315,15 done, 1348.1488103866577
1320,0 done, 1352.6020736694336
1320,5 done, 1352.8901252746582
1320,10 done, 1353.1793999671936
1320,15 done, 1353.47087931633
1325,0 done, 1357.870010137558
1325,5 done, 1358.1560714244843
1325,10 done, 1358.436113834381
1325,15 done, 1358.726315498352
1330,0 done, 1363.2116422653198
1330,5 done, 1363.4920978546143
1330,10 done, 1363.776396036148
1330,15 done, 1364.0685241222382
1335,0 done, 1368.555593252182
1335,5 done, 1368.8432149887085
1335,10 done, 1369.1355984210968
1335,15 done, 1369.419828414917
1340,0 done, 1373.880407333374
1340,5 done, 1374.1606640815735
1340,10 done, 1374.4566433429718
1340,15 done, 1374.7302160263062
1345,0 done, 1379.1529858112335
1345,5 done, 1379.4387147426605
1345,10 done, 1379.7273545265198
1345,15 done, 1380.005935907364
1350,0 done, 1384.456226825714
1350,5 done, 1384.7418467998505
1350,10 done, 1385.0337941646576
1350,15 done, 1385.320437669754
1355,0 done, 1389.7390677928925
1355,5 done, 1390.0194087028503
1355,10 done, 1390.3147356510162
1355,15 done, 1390.5938243865967
1360,0 done, 1395.0651240348816
1360,5 done, 1395.3431298732758
1360,10 done, 1395.6388103961945
1360,15 done, 1395.9253718852997
1365,0 done, 1400.3528988361359
1365,5 done, 1400.6398475170135
1365,10 done, 1400.9173476696014
1365,15 done, 1401.2045514583588
1370,0 done, 1405.6582164764404
1370,5 done, 1405.944248199463
1370,10 done, 1406.2306587696075
1370,15 done, 1406.5185101032257
1375,0 done, 1410.9868590831757
1375,5 done, 1411.2783670425415
1375,10 done, 1411.5689027309418
1375,15 done, 1411.8499228954315
1380,0 done, 1416.280156135559
1380,5 done, 1416.5612716674805
1380,10 done, 1416.8538286685944
1380,15 done, 1417.133064031601
1385,0 done, 1421.5894074440002
1385,5 done, 1421.8790233135223
1385,10 done, 1422.1646699905396
1385,15 done, 1422.4518551826477
1390,0 done, 1426.8892378807068
1390,5 done, 1427.1749420166016
1390,10 done, 1427.4678270816803
1390,15 done, 1427.7568488121033
1395,0 done, 1432.2238388061523
1395,5 done, 1432.5165808200836
1395,10 done, 1432.7975931167603
1395,15 done, 1433.0853350162506
1400,0 done, 1437.5340840816498
1400,5 done, 1437.8241851329803
1400,10 done, 1438.105785369873
1400,15 done, 1438.3948693275452
1405,0 done, 1442.8307745456696
1405,5 done, 1443.1074397563934
1405,10 done, 1443.3991956710815
1405,15 done, 1443.6927409172058
1410,0 done, 1448.1311299800873
1410,5 done, 1448.4112389087677
1410,10 done, 1448.7007024288177
1410,15 done, 1448.9871485233307
1415,0 done, 1453.4504115581512
1415,5 done, 1453.739330291748
1415,10 done, 1454.019003868103
1415,15 done, 1454.2978746891022
1420,0 done, 1458.7969460487366
1420,5 done, 1459.0788702964783
1420,10 done, 1459.3702671527863
1420,15 done, 1459.660877943039
1425,0 done, 1464.1276466846466
1425,5 done, 1464.4108002185822
1425,10 done, 1464.7030584812164
1425,15 done, 1464.9952533245087
1430,0 done, 1469.44908452034
1430,5 done, 1469.7254812717438
1430,10 done, 1470.019365310669
1430,15 done, 1470.3115231990814
1435,0 done, 1474.7442061901093
1435,5 done, 1475.0177021026611
1435,10 done, 1475.31063246727
1435,15 done, 1475.604742527008
1440,0 done, 1479.9793946743011
1440,5 done, 1480.2744054794312
1440,10 done, 1480.5588250160217
1440,15 done, 1480.848289012909
1445,0 done, 1485.3021142482758
1445,5 done, 1485.5885136127472
1445,10 done, 1485.8802309036255
1445,15 done, 1486.1670134067535
1450,0 done, 1490.6566007137299
1450,5 done, 1490.9395499229431
1450,10 done, 1491.2314400672913
1450,15 done, 1491.5234537124634
1455,0 done, 1495.9523499011993
1455,5 done, 1496.2443075180054
1455,10 done, 1496.5368492603302
1455,15 done, 1496.8190381526947
1460,0 done, 1501.1923081874847
1460,5 done, 1501.4696683883667
1460,10 done, 1501.754490852356
1460,15 done, 1502.0424206256866
1465,0 done, 1506.50576877594
1465,5 done, 1506.7909507751465
1465,10 done, 1507.0778632164001
1465,15 done, 1507.3689999580383
1470,0 done, 1511.842297077179
1470,5 done, 1512.1233472824097
1470,10 done, 1512.416803598404
1470,15 done, 1512.710957288742
1475,0 done, 1517.1613419055939
1475,5 done, 1517.4547715187073
1475,10 done, 1517.7403650283813
1475,15 done, 1518.0270719528198
1480,0 done, 1522.4738779067993
1480,5 done, 1522.766577720642
1480,10 done, 1523.0550026893616
1480,15 done, 1523.3425183296204
1485,0 done, 1527.8103561401367
1485,5 done, 1528.1033325195312
1485,10 done, 1528.3875641822815
1485,15 done, 1528.6712081432343
1490,0 done, 1533.0466799736023
1490,5 done, 1533.3303785324097
1490,10 done, 1533.6261901855469
1490,15 done, 1533.9132680892944
1495,0 done, 1538.3994448184967
1495,5 done, 1538.6864848136902
1495,10 done, 1538.9653491973877
1495,15 done, 1539.260353088379
1500,0 done, 1543.638282775879
1500,5 done, 1543.9250354766846
1500,10 done, 1544.214593410492
1500,15 done, 1544.5050191879272
1505,0 done, 1548.8984842300415
1505,5 done, 1549.1821494102478
1505,10 done, 1549.466402053833
1505,15 done, 1549.752501964569
1510,0 done, 1554.2031362056732
1510,5 done, 1554.493281841278
1510,10 done, 1554.7908072471619
1510,15 done, 1555.0753591060638
1515,0 done, 1559.5585503578186
1515,5 done, 1559.8379545211792
1515,10 done, 1560.1321606636047
1515,15 done, 1560.420693397522
1520,0 done, 1564.879062652588
1520,5 done, 1565.1623225212097
1520,10 done, 1565.451782464981
1520,15 done, 1565.7412202358246
1525,0 done, 1570.206297159195
1525,5 done, 1570.4942679405212
1525,10 done, 1570.7786943912506
1525,15 done, 1571.070872783661
1530,0 done, 1575.5144588947296
1530,5 done, 1575.8031005859375
1530,10 done, 1576.0884537696838
1530,15 done, 1576.3824422359467
1535,0 done, 1580.8448729515076
1535,5 done, 1581.128226518631
1535,10 done, 1581.4175453186035
1535,15 done, 1581.7058882713318
1540,0 done, 1586.164858341217
1540,5 done, 1586.4474453926086
1540,10 done, 1586.7444491386414
1540,15 done, 1587.0311632156372
1545,0 done, 1591.4871175289154
1545,5 done, 1591.7666375637054
1545,10 done, 1592.0594322681427
1545,15 done, 1592.3514547348022
1550,0 done, 1596.838164806366
1550,5 done, 1597.1234109401703
1550,10 done, 1597.4022674560547
1550,15 done, 1597.6992835998535
1555,0 done, 1602.1487746238708
1555,5 done, 1602.4302070140839
1555,10 done, 1602.7262332439423
1555,15 done, 1602.999845981598
1560,0 done, 1607.459373474121
1560,5 done, 1607.7379531860352
1560,10 done, 1608.0324068069458
1560,15 done, 1608.32715177536
1565,0 done, 1612.793131351471
1565,5 done, 1613.0726613998413
1565,10 done, 1613.3642733097076
1565,15 done, 1613.6548962593079
1570,0 done, 1618.106910943985
1570,5 done, 1618.3916292190552
1570,10 done, 1618.6894607543945
1570,15 done, 1618.974604845047
1575,0 done, 1623.3689367771149
1575,5 done, 1623.6664776802063
1575,10 done, 1623.9582056999207
1575,15 done, 1624.25022315979
1580,0 done, 1628.5396060943604
1580,5 done, 1628.8150129318237
1580,10 done, 1629.095269203186
1580,15 done, 1629.377382516861
1585,0 done, 1633.658089876175
1585,5 done, 1633.9359622001648
1585,10 done, 1634.2099792957306
1585,15 done, 1634.4954874515533
1590,0 done, 1638.7988607883453
1590,5 done, 1639.0726022720337
1590,10 done, 1639.3576691150665
1590,15 done, 1639.6377985477448
1595,0 done, 1643.923364162445
1595,5 done, 1644.1949224472046
1595,10 done, 1644.4807980060577
1595,15 done, 1644.7643427848816
1600,0 done, 1648.9949202537537
1600,5 done, 1649.2699205875397
1600,10 done, 1649.5517718791962
1600,15 done, 1649.8374943733215
1605,0 done, 1654.0992937088013
1605,5 done, 1654.38671541214
1605,10 done, 1654.6593968868256
1605,15 done, 1654.9437880516052
1610,0 done, 1659.1948356628418
1610,5 done, 1659.4742982387543
1610,10 done, 1659.755402803421
1610,15 done, 1660.0431714057922
1615,0 done, 1664.343198299408
1615,5 done, 1664.6266205310822
1615,10 done, 1664.9086818695068
1615,15 done, 1665.1851456165314
1620,0 done, 1669.4755675792694
1620,5 done, 1669.754335641861
1620,10 done, 1670.0300176143646
1620,15 done, 1670.3133940696716
1625,0 done, 1674.6247735023499
1625,5 done, 1674.9123027324677
1625,10 done, 1675.1876277923584
1625,15 done, 1675.4713144302368
1630,0 done, 1679.7762053012848
1630,5 done, 1680.0508410930634
1630,10 done, 1680.3339908123016
1630,15 done, 1680.6247470378876
1635,0 done, 1684.9116768836975
1635,5 done, 1685.1884744167328
1635,10 done, 1685.47207736969
1635,15 done, 1685.758861064911
1640,0 done, 1690.0661549568176
1640,5 done, 1690.3442685604095
1640,10 done, 1690.6331582069397
1640,15 done, 1690.918221950531
1645,0 done, 1695.2315516471863
1645,5 done, 1695.5156552791595
1645,10 done, 1695.793860912323
1645,15 done, 1696.0813529491425
1650,0 done, 1700.4476068019867
1650,5 done, 1700.7275772094727
1650,10 done, 1701.0119335651398
1650,15 done, 1701.28173828125
1655,0 done, 1705.5868134498596
1655,5 done, 1705.8665626049042
1655,10 done, 1706.1491515636444
1655,15 done, 1706.4296061992645
1660,0 done, 1710.7366631031036
1660,5 done, 1711.0222730636597
1660,10 done, 1711.2992975711823
1660,15 done, 1711.585297346115
1665,0 done, 1715.8622040748596
1665,5 done, 1716.135176897049
1665,10 done, 1716.4167003631592
1665,15 done, 1716.7077248096466
1670,0 done, 1720.982511997223
1670,5 done, 1721.2617268562317
1670,10 done, 1721.5397174358368
1670,15 done, 1721.8112037181854
1675,0 done, 1726.082765340805
1675,5 done, 1726.3536608219147
1675,10 done, 1726.6240792274475
1675,15 done, 1726.899139881134
1680,0 done, 1731.1563973426819
1680,5 done, 1731.4409003257751
1680,10 done, 1731.731808423996
1680,15 done, 1732.0064458847046
1685,0 done, 1736.2261145114899
1685,5 done, 1736.497272014618
1685,10 done, 1736.7817151546478
1685,15 done, 1737.053953409195
1690,0 done, 1741.365031003952
1690,5 done, 1741.6398768424988
1690,10 done, 1741.923192024231
1690,15 done, 1742.2077927589417
1695,0 done, 1746.493324995041
1695,5 done, 1746.7710993289948
1695,10 done, 1747.0352575778961
1695,15 done, 1747.3261651992798
1700,0 done, 1751.7026813030243
1700,5 done, 1751.9838466644287
1700,10 done, 1752.266129732132
1700,15 done, 1752.5413556098938
1705,0 done, 1756.8853437900543
1705,5 done, 1757.1559665203094
1705,10 done, 1757.4530477523804
1705,15 done, 1757.7383739948273
1710,0 done, 1762.2250459194183
1710,5 done, 1762.5164391994476
1710,10 done, 1762.7980518341064
1710,15 done, 1763.0865623950958
1715,0 done, 1767.5114269256592
1715,5 done, 1767.7921414375305
1715,10 done, 1768.0839972496033
1715,15 done, 1768.3746898174286
1720,0 done, 1772.7811403274536
1720,5 done, 1773.0634455680847
1720,10 done, 1773.3448324203491
1720,15 done, 1773.6367201805115
1725,0 done, 1778.1351535320282
1725,5 done, 1778.4337902069092
1725,10 done, 1778.721649646759
1725,15 done, 1779.0179092884064
1730,0 done, 1783.5315346717834
1730,5 done, 1783.8199300765991
1730,10 done, 1784.1041901111603
1730,15 done, 1784.3871011734009
1735,0 done, 1788.7118699550629
1735,5 done, 1788.9948437213898
1735,10 done, 1789.2687556743622
1735,15 done, 1789.56090259552
1740,0 done, 1794.0425453186035
1740,5 done, 1794.3427095413208
1740,10 done, 1794.6251349449158
1740,15 done, 1794.9177000522614
1745,0 done, 1799.3805117607117
1745,5 done, 1799.6638362407684
1745,10 done, 1799.9652247428894
1745,15 done, 1800.248604774475
1750,0 done, 1804.736929178238
1750,5 done, 1805.0271327495575
1750,10 done, 1805.313205242157
1750,15 done, 1805.6027328968048
1755,0 done, 1810.0428881645203
1755,5 done, 1810.3235921859741
1755,10 done, 1810.6056263446808
1755,15 done, 1810.8874933719635
1760,0 done, 1815.28693151474
1760,5 done, 1815.5863530635834
1760,10 done, 1815.8716411590576
1760,15 done, 1816.1379835605621
1765,0 done, 1820.498405456543
1765,5 done, 1820.7805151939392
1765,10 done, 1821.0755670070648
1765,15 done, 1821.3610372543335
1770,0 done, 1825.6982960700989
1770,5 done, 1825.979425907135
1770,10 done, 1826.249035835266
1770,15 done, 1826.528493642807
1775,0 done, 1830.8511908054352
1775,5 done, 1831.125937461853
1775,10 done, 1831.4026646614075
1775,15 done, 1831.6832506656647
1780,0 done, 1836.009681224823
1780,5 done, 1836.2952706813812
1780,10 done, 1836.5763463974
1780,15 done, 1836.8522922992706
1785,0 done, 1841.1721234321594
1785,5 done, 1841.4476647377014
1785,10 done, 1841.7438611984253
1785,15 done, 1842.0183806419373
1790,0 done, 1846.3699111938477
1790,5 done, 1846.6484065055847
1790,10 done, 1846.935781955719
1790,15 done, 1847.2180392742157
1795,0 done, 1851.528379201889
1795,5 done, 1851.816967010498
1795,10 done, 1852.100044965744
1795,15 done, 1852.3791456222534
1800,0 done, 1856.7189676761627
1800,5 done, 1857.0005943775177
1800,10 done, 1857.2817344665527
1800,15 done, 1857.5671706199646
1805,0 done, 1861.9233078956604
1805,5 done, 1862.2011711597443
1805,10 done, 1862.4832408428192
1805,15 done, 1862.7702803611755
1810,0 done, 1867.0914704799652
1810,5 done, 1867.3716359138489
1810,10 done, 1867.6560761928558
1810,15 done, 1867.9405903816223
1815,0 done, 1872.3122537136078
1815,5 done, 1872.5958116054535
1815,10 done, 1872.881212234497
1815,15 done, 1873.1608266830444
1820,0 done, 1877.7236602306366
1820,5 done, 1877.9994888305664
1820,10 done, 1878.2660038471222
1820,15 done, 1878.555434703827
1825,0 done, 1882.8460788726807
1825,5 done, 1883.1310040950775
1825,10 done, 1883.4048540592194
1825,15 done, 1883.6852207183838
In [85]:
len(flattened)
Out[85]:
165478
In [86]:
tmp = pd.DataFrame(flattened,columns = ['new_id'])
tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)
Out[86]:
(0.72, 165478, 0.5836364954858048)
In [88]:
195574/165478
Out[88]:
1.1818731190853164
In [96]:
tmp.to_csv(tmp_dir+'ts.scoring.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
In [97]:
len(tmp)
Out[97]:
165478
In [100]:
tmp = pd.merge(tmp,df[['new_id','t']],how='left',on='new_id')
In [105]:
tmp = pd.merge(tmp,df[['new_id','gene_id']],how='left',on='new_id')
In [107]:
tmp['t1'] = 1
tmp = pd.merge(tmp.drop('t1',1),tmp.groupby('gene_id').agg({'t1':sum}).reset_index(),how='inner',on='gene_id')
In [108]:
len(tmp.loc[tmp['t1']<4])/len(tmp)
Out[108]:
0.10560920484898295
In [111]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))

tmp['t1'] = 1
gr = tmp.groupby('gene_id').agg({'t1':sum}).reset_index()

ax = sns.histplot(gr['t1'],stat='density')
ax.set(xlabel = '# filtered PAS in the gene', title='points are GENES')
Out[111]:
[Text(0.5, 0, '# filtered PAS in the gene'),
 Text(0.5, 1.0, 'points are GENES')]
No description has been provided for this image
In [114]:
gr['t1'].quantile(0.5)
Out[114]:
4.0
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))

ax = sns.histplot(df['t'],stat='density')
ax.set(xlabel = '# PAS in the gene')

Final quantification and figure about comparison with v2 and DL-retrieved atlas¶

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [50]:
 
In [34]:
expr_feature = feature+'_sum'
scoring_feature = feature+'_ratio'

data['expr_cat'] = pd.qcut(data[expr_feature],q=5)
for expr_cat in list(data['expr_cat'].unique()):
    tmp = data.loc[data['expr_cat']==expr_cat].reset_index(drop=True)
    tmp = tmp[['new_id',scoring_feature,'any_canonic_motif']].sort_values(scoring_feature,ascending=False).reset_index(drop=True)
    tmp['t']=1
    tmp['t_cumsum'] = tmp['t'].cumsum()
    tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
    tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
    if tmp['frac_cumul'].max()>=motif_threshold:
        max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
        pas_to_append = list(tmp.loc[0:max_index]['new_id'])
        
In [32]:
Counter(data['any_canonic_motif'])
Out[32]:
Counter({0: 83380, 1: 16620})
In [ ]:
 
In [ ]:
def get_PAS_scores(L,sample,anchor_df,iterator):
    tmp = PAQR_median_expression_matrix.loc[PAQR_median_expression_matrix[sample]!=-1][['Row.names','exon',sample]].dropna().reset_index(drop=True)
    tmp[sample] = tmp[sample].astype('int')
    tmp = pd.merge(tmp,tmp.groupby('exon').agg({sample:sum}).reset_index().rename(columns={sample:'exon_sum'}),how='inner',on='exon')
    tmp = tmp.loc[tmp['exon_sum']>0].reset_index(drop=True)
    tmp['score'] = tmp.apply(lambda x:np.log10(stats.binomtest(x[sample], x['exon_sum'], p=0.5, alternative='greater').pvalue+10**(-300))*(-1)-np.log10(stats.binomtest(x[sample], x['exon_sum'], p=0.5, alternative='less').pvalue+10**(-300))*(-1),1)
    tmp = tmp[['Row.names','score']].rename(columns={'score':sample})
    tmp = pd.merge(anchor_df,tmp,how='left',on='Row.names')[[sample]]
    L.append(tmp)
    if iterator%100==0:
        print(str(iterator)+' done, '+str(time.time()-start_time))    

start_time = time.time()

anchor_df = PAQR_median_expression_matrix[['Row.names']]

with Manager() as manager:
    L = manager.list()
    processes = []
    i=0
    for sample in list_of_samples:
        p = Process(target=get_PAS_scores, args=(L,sample,anchor_df,i))  # Passing the list
        p.start()
        processes.append(p)
        i=i+1
    for p in processes:
        p.join()
    L = list(L)

res = pd.concat([anchor_df[['Row.names']]]+L,axis=1)

print(time.time()-start_time)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [142]:
feature = 'score'
y_feature = feature+'_ratio'
x_feature = feature+'_sum'
x_feature_log = x_feature+'_log'

data = df.loc[df['t']==2][['new_id','gene_id','any_canonic_motif','t']+[feature,y_feature,x_feature]].copy()
data[x_feature_log] = np.log2(data[x_feature]+10**(-30))

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))

ax = sns.scatterplot(data = data.sample(1000),x=x_feature_log,y=y_feature,s=3)
No description has been provided for this image
In [143]:
data.head()
Out[143]:
new_id gene_id any_canonic_motif t score score_ratio score_sum score_sum_log
16 290 ENSG00000279928.2 1 2 0.116169 0.308413 0.376666 -1.408642
17 291 ENSG00000279928.2 0 2 0.260497 0.691587 0.376666 -1.408642
33 855 ENSG00000268663.1 0 2 0.124622 0.342533 0.363825 -1.458682
34 858 ENSG00000268663.1 0 2 0.239203 0.657467 0.363825 -1.458682
11279 27289 ENSG00000260972.1 1 2 2.564076 0.438579 5.846327 2.547530
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [6]:
 
In [7]:
merged_pas_motif_table = pd.merge(SCINPAS_full[['seqid','start','end','id','score']+tissues].rename(columns={'score':'score_1'}),
         merged_pas_motif_table,how='inner',on=['id'])
In [8]:
merged_pas_motif_table = merged_pas_motif_table.drop('score',1).rename(columns={'score_1':'score'}) # score_1 from SCINPAS 
In [9]:
len(merged_pas_motif_table)
Out[9]:
18432135
In [10]:
cols = list(merged_pas_motif_table.columns)
motifs = cols[-15:-4]
In [11]:
merged_pas_motif_table[motifs] = merged_pas_motif_table[motifs].astype('boolean')

merged_pas_motif_table['any_canonic_motif'] = (merged_pas_motif_table[motifs].sum(1)>0).astype('int')
In [12]:
 
Out[12]:
seqid start end id score strand class gene_id gene_name nose ... kidney penis ureter lung liver skin prostate uterus bloodImmune brain
0 chr1 9999 10000 chr1:10000:-:9999:10000:0.5596258722876915:3 0.559626 - true_intergenic NaN NaN 0.0 ... 0.0 0.000000 0.006918 0.0 0.000000 0.010019 0.0 0.0 0.000000 0.000000
1 chr1 10464 10465 chr1:10465:+:10464:10465:0.401783597747119:1 0.401784 + true_intergenic NaN NaN 0.0 ... 0.0 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.000000 0.004415
2 chr1 11533 11534 chr1:11534:-:11533:11534:0.2873875380716641:1 0.287388 - true_intergenic NaN NaN 0.0 ... 0.0 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.001145 0.000000
3 chr1 14350 14370 chr1:14355:-:14350:14370:52.50382345788897:82 52.503823 - antisense_TE NaN NaN 0.0 ... 0.0 0.085025 0.033230 0.0 0.081067 0.128423 0.0 0.0 0.139736 0.000000
4 chr1 14395 14425 chr1:14403:-:14395:14425:283.585503076957:117 283.585503 - TE ENSG00000227232.5 WASH7P 0.0 ... 0.0 0.000000 0.015700 0.0 0.013593 0.000000 0.0 0.0 1.081058 0.034856

5 rows × 27 columns

In [ ]:
merged_pas_motif_table.head()
In [ ]:
 
In [ ]:
 
In [77]:
 
In [66]:
 
In [ ]:
 
In [35]:
merged_pas_motif_table['score_cat'] = pd.qcut(merged_pas_motif_table['score'],10)
In [46]:
merged_pas_motif_table['score'].max()
Out[46]:
21713.1321024367
In [54]:
a = []
for quantile in list(pd.Series(range(0,101,2))/100)[:-1]:
    threshold = c['score'].quantile(quantile)
    tmp = merged_pas_motif_table.loc[merged_pas_motif_table['score']>threshold]
    n,m = len(tmp),tmp['any_canonic_motif'].sum()
    a.append([quantile,threshold,n,m/n])
In [68]:
len(polyAsite)/10**6
Out[68]:
0.569005
In [65]:
np.round(len(merged_pas_motif_table)/len(polyAsite),1)
Out[65]:
2.2
In [55]:
pd.DataFrame(a)
Out[55]:
0 1 2 3
0 0.00 0.023074 1228808 0.311388
1 0.02 0.023687 1204232 0.312293
2 0.04 0.024322 1179656 0.313075
3 0.06 0.024986 1155080 0.314088
4 0.08 0.025688 1130504 0.315049
5 0.10 0.026429 1105928 0.316039
6 0.12 0.027216 1081352 0.316867
7 0.14 0.028043 1056775 0.317816
8 0.16 0.028922 1032199 0.318822
9 0.18 0.029843 1007623 0.319764
10 0.20 0.030826 983047 0.320675
11 0.22 0.031868 958471 0.321649
12 0.24 0.032917 933895 0.323047
13 0.26 0.033967 909315 0.324519
14 0.28 0.035115 884742 0.325992
15 0.30 0.036380 860166 0.327324
16 0.32 0.037750 835590 0.328658
17 0.34 0.039132 811014 0.330296
18 0.36 0.040457 786438 0.332732
19 0.38 0.041920 761861 0.335018
20 0.40 0.043527 737285 0.337073
21 0.42 0.045256 712709 0.339137
22 0.44 0.047148 688133 0.341223
23 0.46 0.049229 663557 0.343313
24 0.48 0.051434 638981 0.345652
25 0.50 0.053772 614404 0.348209
26 0.52 0.056326 589828 0.350862
27 0.54 0.058981 565252 0.354000
28 0.56 0.061785 540676 0.357208
29 0.58 0.064720 516100 0.360610
30 0.60 0.068047 491524 0.364243
31 0.62 0.071745 466948 0.368268
32 0.64 0.075892 442371 0.372698
33 0.66 0.080524 417795 0.377281
34 0.68 0.085789 393219 0.382138
35 0.70 0.091739 368643 0.387535
36 0.72 0.098589 344067 0.393752
37 0.74 0.106429 319491 0.400111
38 0.76 0.115246 294914 0.407780
39 0.78 0.125782 270338 0.416157
40 0.80 0.138443 245762 0.425965
41 0.82 0.153796 221186 0.438332
42 0.84 0.173340 196610 0.452947
43 0.86 0.198312 172034 0.470698
44 0.88 0.231893 147457 0.493018
45 0.90 0.278884 122881 0.519934
46 0.92 0.352077 98305 0.557571
47 0.94 0.482353 73729 0.612635
48 0.96 0.783611 49153 0.697658
49 0.98 2.161783 24577 0.829637
In [ ]:
 
In [31]:
Counter(merged_pas_motif_table['any_canonic_motif'])
Out[31]:
Counter({1: 382636, 0: 846173})
In [59]:
382636/(382636+846173)
Out[59]:
0.31138769328675164
In [ ]:
 
In [17]:
 
In [ ]:
 
In [ ]:
 
In [4]:
merged_pas_motif_table.head()
Out[4]:
score class AAUAAA AUUAAA UAUAAA AGUAAA AAUACA AAUAUA CAUAAA GAUAAA ACUAAA AAUAGA phastcon entropy num_cs width any_canonic_motif
0 0.559626 true_intergenic False False False False False False False False False False 0.000000 0.000000 1 1 0
1 0.287388 true_intergenic False True False False False False False False False False 0.010700 0.000000 1 1 1
2 52.503823 antisense_TE True False False False False False False False False False 0.003725 0.618032 17 20 1
3 283.585503 TE True False False False False False False False False False 0.001900 0.391705 21 30 1
4 0.153752 TE False False False False False False False False False False 0.060475 0.000000 1 1 0
In [ ]:
 
In [ ]:
 
In [ ]:
def gini(x):
    return 1-np.sum([elem**2 for elem in x])
In [1]:
merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [4,6,7,8,9,11,12,13,14,16,17,18,19,20,21,22])
merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category')
cols = list(merged_pas_motif_table.columns)
merged_pas_motif_table[cols[2:-4]] = merged_pas_motif_table[cols[2:-4]].astype('boolean')

merged_pas_motif_table['any_canonic_motif'] = merged_pas_motif_table[cols[2:-4]].sum(1)
merged_pas_motif_table['any_canonic_motif'] = merged_pas_motif_table['any_canonic_motif'].astype('int')

merged_pas_motif_table['phastcon_quant'] = pd.qcut(merged_pas_motif_table['phastcon'],q=15,labels = list('q'+pd.Series(range(1,16)).astype('str')))

def gini(x):
    return 1-np.sum([elem**2 for elem in x])

df = pd.DataFrame(merged_pas_motif_table['num_cs'].unique(),columns=['num_cs'])
df['max_gini'] = df.apply(lambda x:gini([1/x['num_cs']]*x['num_cs']),1)

merged_pas_motif_table = pd.merge(merged_pas_motif_table,df,how='left',on='num_cs')
merged_pas_motif_table = merged_pas_motif_table.rename(columns = {'entropy':'gini'})
merged_pas_motif_table['max_gini'] = merged_pas_motif_table['max_gini']+10**(-5)
merged_pas_motif_table['normal_gini'] = merged_pas_motif_table['gini']/merged_pas_motif_table['max_gini']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [4,6,7,8,9,11,12,13,14,16,17,18,19,20,21,22])
      2 merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category')
      3 cols = list(merged_pas_motif_table.columns)

NameError: name 'pd' is not defined
In [3]:
merged_pas_motif_table['num_cs'] = merged_pas_motif_table['num_cs'].astype('category')
merged_pas_motif_table['width'] = merged_pas_motif_table['width'].astype('category')
In [4]:
merged_pas_motif_table['RPM_log2'] = np.log2(merged_pas_motif_table['score'])
In [89]:
merged_pas_motif_table['RPM_quant'] = pd.qcut(merged_pas_motif_table['RPM_log2'],q=100,labels = list('q'+pd.Series(range(1,101)).astype('str')))
In [33]:
merged_pas_motif_table['normal_gini_bin'] = pd.cut(merged_pas_motif_table['normal_gini'],bins=20)
In [7]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))

data = merged_pas_motif_table.sample(200000)
data['t']=1

gr = data.groupby('num_cs').agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)
x_feature = 'num_cs'

ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%',color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',estimator=np.median,color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',estimator=np.median,color='blue')
ax.set(xlabel='# of cleavage sites in PAS')
Out[7]:
[Text(0.5, 0, '# of cleavage sites in PAS')]
No description has been provided for this image
In [113]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))

x_feature = 'width'

data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs']==2].sample(200000)
data['t']=1

gr = data.groupby(x_feature).agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)

ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%',color='blue')
ax.set(xlabel='',title='PAS having 2 cleavage sites')
ax = sns.pointplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',estimator=np.median,color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',estimator=np.median,color='blue')
ax.set(xlabel='PAS width, nt')
Out[113]:
[Text(0.5, 0, 'PAS width, nt')]
No description has been provided for this image
In [115]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))

x_feature = 'width'

data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs']==3].sample(200000)
data['t']=1

gr = data.groupby(x_feature).agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)

ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%',color='blue')
ax.set(xlabel='',title='PAS having 2 cleavage sites')
ax = sns.pointplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',estimator=np.median,color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',estimator=np.median,color='blue')
ax.set(xlabel='PAS width, nt')
Out[115]:
[Text(0.5, 0, 'PAS width, nt')]
No description has been provided for this image
In [8]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))

data = merged_pas_motif_table.sample(200000)
data['t']=1

gr = data.groupby('num_cs').agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)
x_feature = 'num_cs'

ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%')
ax.set(xlabel='')
ax = sns.boxplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',showfliers=False)
ax.set(xlabel='')
ax = sns.boxplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',showfliers=False)
ax.set(xlabel='# of cleavage sites in PAS')
Out[8]:
[Text(0.5, 0, '# of cleavage sites in PAS')]
No description has been provided for this image
In [100]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,3, sharey=True, sharex=True,figsize=(10,3))

i = 0
for num_cs in [2,3,4]:
    data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs']==num_cs].sample(100000)
    data['RPM_quant'] = pd.qcut(data['RPM_log2'],q=15,labels = list('q'+pd.Series(range(1,16)).astype('str')))
    
    ax = sns.histplot(ax=axes[i], data = data,x='RPM_log2',y='normal_gini',stat='density')
    ax.set(title = str(num_cs)+' cleavage sites')
    if i>0:
        ax.set(ylabel='')
    i=i+1
fig.tight_layout(pad=0.5)
No description has been provided for this image
In [94]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=True, sharex=True,figsize=(10,3))

data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs'].isin([2,3,4,5])]
data['RPM_quant'] = pd.qcut(data['RPM_log2'],q=10,labels = list('q'+pd.Series(range(1,11)).astype('str')))
    
ax = sns.boxplot(data = data,x='RPM_quant',y='normal_gini',hue='num_cs',showfliers=False,saturation=1)
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='# of cleavage\nsites',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(xlabel = 'total RPM quantile, in a given # of cleavage sites')
Out[94]:
[Text(0.5, 0, 'total RPM quantile, in a given # of cleavage sites')]
No description has been provided for this image
In [119]:
len(merged_pas_motif_table)*0.06*0.2
Out[119]:
221185.62
In [9]:
merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].astype('str')=='q15']['score'].min()
Out[9]:
18.75919782185748
In [123]:
18/900
Out[123]:
0.02
In [86]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,2, sharey=False, sharex=False,figsize=(8,5))

data = merged_pas_motif_table.sample(20000)

x_feature = 'RPM_log2'
ax = sns.histplot(ax=axes[0],data = data, x = x_feature,stat='density',hue='RPM_quant',alpha=1,legend=False)

x_feature = 'phastcon'
ax = sns.histplot(ax=axes[1],data = data, x = x_feature,stat='density',hue='phastcon_quant',alpha=1)
No description has been provided for this image
In [102]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,6, sharey=False, sharex=False,figsize=(15,3))

data = merged_pas_motif_table.loc[merged_pas_motif_table['class'].isin(['TE'])].sample(20000)

ax = sns.boxplot(ax=axes[0],data = data, x = 'RPM_log2',y='phastcon_quant',showfliers=False,saturation=1)
xval = data.loc[data['phastcon_quant']=='q1']['RPM_log2'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within TE')

ax = sns.boxplot(ax=axes[1],data = data, x = 'phastcon',y='RPM_quant',showfliers=False,saturation=1)
xval = data.loc[data['RPM_quant']=='q1']['phastcon'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within TE')

data = merged_pas_motif_table.loc[merged_pas_motif_table['class'].isin(['intronic'])].sample(20000)

ax = sns.boxplot(ax=axes[2],data = data, x = 'RPM_log2',y='phastcon_quant',showfliers=False,saturation=1)
xval = data.loc[data['phastcon_quant']=='q1']['RPM_log2'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intronic')

ax = sns.boxplot(ax=axes[3],data = data, x = 'phastcon',y='RPM_quant',showfliers=False,saturation=1)
xval = data.loc[data['RPM_quant']=='q1']['phastcon'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intronic')

data = merged_pas_motif_table.loc[merged_pas_motif_table['class'].isin(['true_intergenic'])].sample(20000)

ax = sns.boxplot(ax=axes[4],data = data, x = 'RPM_log2',y='phastcon_quant',showfliers=False,saturation=1)
xval = data.loc[data['phastcon_quant']=='q1']['RPM_log2'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intergenic')

ax = sns.boxplot(ax=axes[5],data = data, x = 'phastcon',y='RPM_quant',showfliers=False,saturation=1)
xval = data.loc[data['RPM_quant']=='q1']['phastcon'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intergenic')

fig.tight_layout(pad=0.5)
No description has been provided for this image
In [14]:
def entropy(x):
    return (-1)*np.sum([elem*np.log2(elem) for elem in x])

def normalized_entropy(x):
    ent = entropy(x)
    max_ent = entropy([1/len(x)]*len(x))
    return ent/max_ent
    
def gini(x):
    return 1-np.sum([elem**2 for elem in x])

l = [0.0001,0.999,0.0003,0.0006]
entropy(l),normalized_entropy(l),gini(l)
Out[14]:
(0.012703219581699458, 0.006351609790849729, 0.001998539999999882)
In [43]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=False, sharex=False,figsize=(3,3))

x_feature = 'RPM_log2'
ax = sns.scatterplot(data = merged_pas_motif_table.sample(20000), x = 'RPM_log2',y='phastcon',s=5,alpha=0.5)
No description has been provided for this image
In [68]:
order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic']

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,len(order), sharey=True, sharex=True,figsize=(15,3))

x_feature = 'RPM_log2'

i=0
for class_ in order:
    ax = sns.histplot(ax = axes[i],data = merged_pas_motif_table.loc[merged_pas_motif_table['class']==class_].sample(1000),x = x_feature,stat='density',alpha=1)
    ax.set(title=class_,xlabel=x_feature)
    i=i+1
No description has been provided for this image
In [91]:
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('class').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='class')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=False, sharex=True,figsize=(20,4))

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue='RPM_quant',order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=5,fontsize=9,mode=None)
ax.set(ylabel = '% within class',xlabel='')

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('RPM_quant').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='RPM_quant')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)


ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue='RPM_quant',order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
xmin,xmax = ax.get_xlim()[0],ax.get_xlim()[1]
ax.hlines(y=7,xmin=xmin,xmax=xmax,color='grey',linestyles='-',linewidth=0.7)
ax.hlines(y=3,xmin=xmin,xmax=xmax,color='grey',linestyles='-',linewidth=0.7)
ax.text(6.7,7,'7%',ha='right',va='bottom',size=8)
ax.text(6.7,3,'3%',ha='right',va='center',size=8)

ax.legend_.remove()
ax.set(ylabel = '% within\nexpression quantile')
Out[91]:
[Text(0, 0.5, '% within\nexpression quantile')]
No description has been provided for this image
In [120]:
merged_pas_motif_table['RPM_quant'] = pd.cut(merged_pas_motif_table['RPM_log2'],bins=100)
In [122]:
merged_pas_motif_table.head()
Out[122]:
score class AAUAAA AUUAAA UAUAAA AGUAAA AAUACA AAUAUA CAUAAA GAUAAA ... any_canonic_motif phastcon_quant max_gini normal_gini RPM_log2 RPM_quant normal_gini_bin t num_cs_bins any_canonic_motif_pres
0 0.559626 true_intergenic False False False False False False False False ... 0 q1 0.000010 0.000000 -0.837465 (-1.005, -0.71] (-0.001, 0.05] 1 (0, 1] False
1 0.287388 true_intergenic False True False False False False False False ... 1 q7 0.000010 0.000000 -1.798931 (-1.89, -1.595] (-0.001, 0.05] 1 (0, 1] True
2 52.503823 antisense_TE True False False False False False False False ... 1 q4 0.941186 0.656652 5.714351 (5.486, 5.781] (0.65, 0.7] 1 (8, 51] True
3 283.585503 TE True False False False False False False False ... 1 q3 0.952391 0.411286 8.147640 (8.141, 8.436] (0.4, 0.45] 1 (8, 51] True
4 0.153752 TE False False False False False False False False ... 0 q11 0.000010 0.000000 -2.701319 (-2.776, -2.481] (-0.001, 0.05] 1 (0, 1] False

5 rows × 26 columns

In [ ]:
 
In [126]:
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(7,1, sharey=False, sharex=True,figsize=(10,10))

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('RPM_quant').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='RPM_quant')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

i=0
for class_name in ['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic']:
    ax = sns.pointplot(ax=axes[i],data = gr.loc[gr['class']==class_name],x='RPM_quant',y='%')
    ax.set(xlabel='')
    i=i+1

ax.set(ylabel = '% within\nexpression quantile')
Out[126]:
[Text(0, 0.5, '% within\nexpression quantile')]
No description has been provided for this image
In [ ]:
 
In [25]:
feature = 'phastcon_quant'

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('class').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='class')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=False, sharex=True,figsize=(13,4))

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within class',xlabel='')

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)


ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within\nphast cons quantile')
Out[25]:
[Text(0, 0.5, '% within\nphast cons quantile')]
No description has been provided for this image
In [26]:
merged_pas_motif_table['num_cs_bins'] = pd.cut(merged_pas_motif_table['num_cs'],bins = [0,1,2,3,4,5,6,7,8,51])
In [27]:
feature = 'num_cs_bins'
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,4))

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q15'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='group:\n# of cleavage sites',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within group',xlabel='',title = 'within top expression quantile')

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q1'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within group',xlabel='',title = 'within lowest expression quantile')
Out[27]:
[Text(0, 0.5, '% within group'),
 Text(0.5, 0, ''),
 Text(0.5, 1.0, 'within lowest expression quantile')]
No description has been provided for this image
In [35]:
feature = 'normal_gini_bin'
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,4))

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q15'])&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='norm. gini index',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within group',xlabel='',title = 'within top expression quantile')

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q1'])&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within group',xlabel='',title = 'within lowest expression quantile')
Out[35]:
[Text(0, 0.5, '% within group'),
 Text(0.5, 0, ''),
 Text(0.5, 1.0, 'within lowest expression quantile')]
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [36]:
feature = 'phastcon_quant'

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,4))

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q15'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons quantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within\nphast cons\nquantile',xlabel='',title = 'within top expression quantile')

merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q1'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)

ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within\nphast cons\nquantile',xlabel='',title = 'within lowest expression quantile')
Out[36]:
[Text(0, 0.5, '% within\nphast cons\nquantile'),
 Text(0.5, 0, ''),
 Text(0.5, 1.0, 'within lowest expression quantile')]
No description has been provided for this image
In [51]:
merged_pas_motif_table['t']=1

d = {}
for motif in cols[2:7]:
    d[motif] = np.sum
d['t'] = np.sum

gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg(d).reset_index()

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(len(cols[2:7]),1, sharey=False, sharex=True,figsize=(8,1.5*len(cols[2:7])))

i=0
for motif in (cols[2:7]):
    gr[motif+'_%'] = np.round(gr[motif]/gr['t']*100,2)
    ax = sns.barplot(ax = axes[i],data = gr,x='class',y=motif+'_%',hue='RPM_quant',order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
    if i==0:
        ax.set(title = '% of PAS with motif, in (class x expr. quantile) category')
    if i!=len(cols[2:7])-1:
        ax.legend_.remove()
        ax.set(xlabel='',xticks=[])
    else:
        ax.legend(bbox_to_anchor=(1.05, 3),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=1,fontsize=9,mode=None)
    ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
    i=i+1
# fig.tight_layout(pad=0.5)
No description has been provided for this image
In [54]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)

y_feature = 'any_canonic_motif_pres'

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=True, sharex=True,figsize=(13,2.5))

feature = 'RPM_quant'

gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity',xlabel='')
Out[54]:
[Text(0.5, 1.0, '% of PAS with any canonical motif in (-35,-10) vicinity'),
 Text(0.5, 0, '')]
No description has been provided for this image
In [58]:
142884.15536499827
148238.42076623856
6937.294029423888
15320.140209926649
8243.35382939582
4785.684895123209
24165.790583432936
In [ ]:
 
In [81]:
gr['motif_not_present'] = gr['t']-gr['any_canonic_motif_pres']

# remove the scale influence
gr['motif_pres'] = gr['any_canonic_motif_pres']/gr['t']*100
gr['motif_not_pres'] = gr['motif_not_present']/gr['t']*100

a = []
for class_name in ['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic']:
    a.append([class_name,stats.chi2_contingency(observed=gr.loc[gr['class']==class_name][['motif_not_pres','motif_pres']].values,)[0]])
a = pd.DataFrame(a,columns = ['class','chi2_stat'])
sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1,1, sharey=True, sharex=True,figsize=(5,2.5))

ax = sns.pointplot(data = a,x='class',y='chi2_stat',order = ['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
Out[81]:
[Text(0, 0, 'TE'),
 Text(1, 0, 'intronic'),
 Text(2, 0, 'exonic'),
 Text(3, 0, 'true_intergenic'),
 Text(4, 0, 'antisense_TE'),
 Text(5, 0, 'antisense_exonic'),
 Text(6, 0, 'antisense_intronic')]
No description has been provided for this image
In [72]:
 
Out[72]:
0 1
0 TE 142884.155365
1 intronic 148238.420766
2 exonic 6937.294029
3 true_intergenic 15320.140210
4 antisense_TE 8243.353829
5 antisense_exonic 4785.684895
6 antisense_intronic 24165.790583
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [48]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)

y_feature = 'any_canonic_motif_pres'

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))


feature = 'RPM_quant'

gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity',xlabel='')

feature = 'phastcon_quant'

gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '')
Out[48]:
[Text(0.5, 1.0, '')]
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [103]:
merged_pas_motif_table.head()
Out[103]:
score class AAUAAA AUUAAA UAUAAA AGUAAA AAUACA AAUAUA CAUAAA GAUAAA ACUAAA AAUAGA phastcon RPM_log2 RPM_quant any_canonic_motif phastcon_quant t any_canonic_motif_pres
0 0.307465 true_intergenic False False False False False False False False False False 0.000000 -1.701508 q3 0 q1 1 False
1 0.307465 true_intergenic False False False False False False False False False False 0.000000 -1.701508 q3 0 q1 1 False
2 6.342241 true_intergenic True False False False False False False False False False 0.000000 2.664993 q13 1 q1 1 True
3 0.023180 true_intergenic False True False False False False False False False False 0.001025 -5.430964 q1 1 q2 1 True
4 1.301714 true_intergenic False False False False False False False False False False 0.009350 0.380412 q9 0 q7 1 False
In [173]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)

y_feature = 'any_canonic_motif_pres'

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))


feature = 'phastcon_quant'

gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q15'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity\n\nwithin TOP expression quantile',xlabel='')

feature = 'phastcon_quant'

gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q1'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(title = 'within lowest expression quantile')
Out[173]:
[Text(0.5, 1.0, 'within lowest expression quantile')]
No description has been provided for this image
In [168]:
merged_pas_motif_table.head()
Out[168]:
score class AAUAAA AUUAAA UAUAAA AGUAAA AAUACA AAUAUA CAUAAA GAUAAA ... any_canonic_motif phastcon_quant max_gini normal_gini RPM_log2 RPM_quant t normal_gini_bin num_cs_bins any_canonic_motif_pres
0 0.338847 true_intergenic False False False False False False False False ... 0 q14 0.000010 0.000000 -1.561296 q4 1 (-0.001, 0.1] (0, 1] False
1 0.458394 true_intergenic False False False False False False False False ... 1 q5 0.666677 0.987850 -1.125339 q5 1 (0.9, 1.0] (2, 3] True
2 0.222169 true_intergenic True False False False False False False False ... 1 q3 0.800010 0.771199 -2.170270 q3 1 (0.7, 0.8] (4, 5] True
3 0.023180 true_intergenic True False False False False False False False ... 1 q1 0.000010 0.000000 -5.430964 q1 1 (-0.001, 0.1] (0, 1] True
4 0.867214 true_intergenic False True False False False False False False ... 1 q5 0.500010 0.104058 -0.205539 q8 1 (0.1, 0.2] (1, 2] True

5 rows × 26 columns

In [175]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)

y_feature = 'any_canonic_motif_pres'

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))


feature = 'num_cs_bins'

gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q15'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='# of cleavage sites',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity\n\nwithin TOP expression quantile',xlabel='')

feature = 'num_cs_bins'

gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q1'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(title = 'within lowest expression quantile')
Out[175]:
[Text(0.5, 1.0, 'within lowest expression quantile')]
No description has been provided for this image
In [185]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)

y_feature = 'any_canonic_motif_pres'

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))


feature = 'normal_gini_bin'

gr = merged_pas_motif_table.loc[(merged_pas_motif_table['RPM_quant']=='q15')&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='gini index\nPAS with >1 cleavage site',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity\n\nwithin TOP expression quantile',xlabel='')

feature = 'normal_gini_bin'

gr = merged_pas_motif_table.loc[(merged_pas_motif_table['RPM_quant']=='q1')&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)

ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(title = 'within lowest expression quantile')
Out[185]:
[Text(0.5, 1.0, 'within lowest expression quantile')]
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [13]:
list_of_indices = list(merged_pas_motif_table.loc[(merged_pas_motif_table['RPM_quant']=='q15')&(
    merged_pas_motif_table['class'].astype('str')=='intronic')].index)
In [14]:
len(list_of_indices)
Out[14]:
697474
In [16]:
data_full = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis/result-240228/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",
                        index_col=None,header=0,usecols = [3,4])
In [17]:
IPA_bed = data_full.loc[list_of_indices].reset_index(drop=True)
In [20]:
IPA_bed['chr'] = IPA_bed['id'].str.split(':',expand=True)[0]
IPA_bed['start'] = IPA_bed['id'].str.split(':',expand=True)[3]
IPA_bed['end'] = IPA_bed['id'].str.split(':',expand=True)[4]
IPA_bed['strand'] = IPA_bed['id'].str.split(':',expand=True)[2]
In [22]:
IPA_bed[['chr','start','end','id','score','strand']].to_csv('/scicore/home/zavolan/GROUP/IPA/IPA_catalogue/SCINPAS_all_normal_q15Expr.bed', sep=str('\t'),header=False,index=None)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [19]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [63]:
gr.loc[gr['organ']=='eye'].sort_values('TE_%')
Out[63]:
organ project organ_x_project TE_% TE_%_organ color_x color_y
17 eye WongAdultRetina eye WongAdultRetina 54.300 79.7425 (0.1998437102815942, 0.6927711055021983, 0.448... (0.1998437102815942, 0.6927711055021983, 0.448...
16 eye HumanFoveaRetinaScheetzSheffield eye HumanFoveaRetinaScheetzSheffield 72.425 79.7425 (0.1998437102815942, 0.6927711055021983, 0.448... (0.1998437102815942, 0.6927711055021983, 0.448...
15 eye HumanCorneaStemCells eye HumanCorneaStemCells 87.060 79.7425 (0.1998437102815942, 0.6927711055021983, 0.448... (0.1998437102815942, 0.6927711055021983, 0.448...
14 eye HumanCorneaDevelopment eye HumanCorneaDevelopment 89.260 79.7425 (0.1998437102815942, 0.6927711055021983, 0.448... (0.1998437102815942, 0.6927711055021983, 0.448...
In [64]:
merged_num_PAS_each_class_rpm.loc[merged_num_PAS_each_class_rpm['project']=='WongAdultRetina']
Out[64]:
sample organ intronic exonic TE true_intergenic antisense_intronic antisense_exonic antisense_TE total_read ... total_log antisense_TE_% antisense_exonic_% antisense_intronic_% true_intergenic_% exonic_% intronic_% TE_% project organ_x_project
412 10X_WongAdultRetina_RetinaWongscRNASample1 eye 3269.0 2056.0 8869.0 575.0 815.0 30.0 63.0 15677.0 ... 4.195291 0.40 0.19 5.20 3.67 13.11 20.85 56.57 WongAdultRetina eye WongAdultRetina
413 10X_WongAdultRetina_RetinaWongscRNASample2 eye 3599.0 2440.0 10276.0 668.0 841.0 28.0 103.0 17955.0 ... 4.254210 0.57 0.16 4.68 3.72 13.59 20.04 57.23 WongAdultRetina eye WongAdultRetina
414 10X_WongAdultRetina_RetinaWongscRNASample3 eye 1022.0 680.0 2485.0 229.0 316.0 17.0 27.0 4776.0 ... 3.679155 0.57 0.36 6.62 4.79 14.24 21.40 52.03 WongAdultRetina eye WongAdultRetina
415 10X_WongAdultRetina_RetinaWongscRNASample4 eye 898.0 558.0 1989.0 204.0 266.0 10.0 30.0 3955.0 ... 3.597256 0.76 0.25 6.73 5.16 14.11 22.71 50.29 WongAdultRetina eye WongAdultRetina

4 rows × 22 columns

In [ ]:
 
In [ ]:
 
In [108]:
lims = (0,6)

sns.set(font_scale=1)
sns.set_style("white")

fig, axes = plt.subplots(1, 1, sharey=False, sharex=False,figsize=(5,5))

merged_num_PAS_each_class['total_log'] = np.log10(merged_num_PAS_each_class['total']+1)
for category in ['antisense_TE','antisense_exonic','antisense_intronic','true_intergenic','exonic','intronic','TE']:
    merged_num_PAS_each_class[category+'_log'] = np.log10(merged_num_PAS_each_class[category]+1)
    merged_num_PAS_each_class[category+'_%'] = np.round(merged_num_PAS_each_class[category]/merged_num_PAS_each_class['total']*100,2)
    
    ax = sns.regplot(data = merged_num_PAS_each_class,y=category+'_log',x='total_log',label = category,scatter_kws={'s':5})
ax.legend(bbox_to_anchor=(1.05, 1),loc=2,borderaxespad=0,title='PAS class',markerscale=1.5,ncols=1,fontsize=9,mode=None)
ax.set(ylabel = '# supported PAS in a class, $log_{10}$',xlabel = '# total supported PAS, $log_{10}$')
# ax.set(xlim=lims,ylim=lims)
Out[108]:
[Text(0, 0.5, '# supported PAS in a class, $log_{10}$'),
 Text(0.5, 0, '# total supported PAS, $log_{10}$')]
No description has been provided for this image

Drafts¶

In [1]:
(26+28)/1995
Out[1]:
0.02706766917293233
In [178]:
NPV = 0.9999
FDR = 0.46
alpha = 0.06 # incidence in population

A = (1-NPV)/NPV
B = alpha/(1-alpha)
C = (1-FDR)/FDR
D = (1+(B-A)/(C-B))

X = NPV/(1-NPV)
Y = (1-alpha)/alpha
Z = FDR/(1-FDR)
specificity = X/(Y*(1+(Y-X)/(Z-Y)))

sensitivity = 1-A/(B*D)
np.round(sensitivity,3)*100,np.round(specificity,3)*100
Out[178]:
(99.9, 94.6)
In [69]:
98.5 - 99.2, 90.6 - 94.6 
Out[69]:
-0.7000000000000028
In [ ]:
def add_chr_prefix(seqid):
    # Check if seqid already starts with 'chr'
    if not seqid.startswith('chr'):
        return 'chr' + seqid
    else:
        return seqid

def change_dna_to_rna(sequence, direction):
    """
    Parameters
    ----------    
    sequence : string
        a current genome DNA sub-sequence with same length as motif 
        (always + strand because reference genome is always + strand).
        
        we need to convert this DNA into RNA so that we can decide
        whether this sub-sequence is identical to the motif or not.
        
    direction : character
        direction of DNA in which a read maps to.
        
    Returns
    -------
    corrected_string : string
        RNA version (5' -> 3') of the current genome DNA sub-sequence.
        Now you can directly compare it with the motif.
        
        i.e. change a subesequence of DNA into RNA so that it becomes compatible with motif (5' -> 3')
    """     
    corrected_sequence = []
    # if a read is mapping to - strand,
    # revert the DNA sequence and then make a complementary
    if direction == '-':
        reverted_subsequence = list(reversed(sequence))
        for elem in reverted_subsequence:
            if elem == 'A':
                corrected_sequence.append('U')
            elif elem == 'T':
                corrected_sequence.append('A')
            elif elem == 'G':
                corrected_sequence.append('C')
            elif elem == 'C':
                corrected_sequence.append('G')
    
    # if a read is mapping to + strand
    # only change T in the DNA -> U in RNA. other nucleotides stay the same
    elif direction == '+':
        for elem in sequence:
            if elem == 'A':
                corrected_sequence.append('A')
            elif elem == 'T':
                corrected_sequence.append('U')
            elif elem == 'G':
                corrected_sequence.append('G')
            elif elem == 'C':
                corrected_sequence.append('C')
                
    # convert a list of characters into a single string
    corrected_string = ''.join(corrected_sequence)
    return corrected_string

def get_extra_col(L,group, motives_df, fasta_path):
    fasta_f = pysam.FastaFile(fasta_path)
    name = group[0]
    pas_df = group[1]
    rcs_list = list(pas_df['id'])

    for index, row in motives_df.iterrows():
        motif = row['motif']
        # upper and lower boundaries are negative values
        # e.g. upper = -35, lower = -10
        upper = int(row['upper'])
        lower = int(row['lower'])
        # print(f'Motif: {motif}, Upper: {upper}, Lower: {lower}')
        values = []
        for rcs_id in rcs_list:

            chrom = add_chr_prefix(rcs_id.split(':')[0])
            
            if chrom == 'not_needed':
                continue
            
            rcs = int(rcs_id.split(':')[1])
            strand = rcs_id.split(':')[2]
            if strand == '-':
                # rcs + 10
                sequence_start = rcs - lower
                # rcs + 35
                sequence_end = rcs - upper
            
            elif strand == '+':
                # rcs - 35
                sequence_start = rcs + upper
                # rcs - 10
                sequence_end = rcs + lower
            
            dna_subsequence = fasta_f.fetch(reference=chrom, start=sequence_start, end=sequence_end + 1)

            # correct DNA subsequence into RNA so that it becomes compatible with motif
            corrected_subsequence = change_dna_to_rna(dna_subsequence, strand)
            if motif in corrected_subsequence:
                # print('corrected_subsequence match : ' + str(corrected_subsequence))
                # print('motif match: ' + str(motif))                
                value = 1
            
            else:
                # print('corrected_subsequence no match : ' + str(corrected_subsequence))
                # print('motif no match: ' + str(motif))
                value = 0
            
            values.append(value)
        
        # assert(len(values) == len(rcs_list))
        # append a new column
        pas_df[motif] = values
    L.append(pas_df)
    print(', '.join(group[0])+' done, '+str(time.time()-start_time))


def add_columns_serial(pas_df, motives_df, fasta_path):
        
    final_dfs = []
    groups = pas_df.groupby(['seqid', 'strand'])
    with Manager() as manager:
        L = manager.list()
        processes = []
        
        i=0
        for group in groups:
            p = Process(target=get_extra_col, args=(L,group, motives_df, fasta_path))  # Passing the list
            p.start()
            processes.append(p)
            i=i+1
        for p in processes:
            p.join()
        L = list(L)
    final_df = pd.concat(L)
    return final_df

start_time = time.time()
v2_intermediate = add_columns_serial(v2_subset, motives, fasta_dir)
v2_intermediate = v2_intermediate.sort_values(['seqid','start','end']).reset_index(drop=True)

v2_final = v2_intermediate.copy()

motif_cols = list(motives['motif'])

v2_final['all_motif'] = v2_final[motif_cols].max(axis=1)
In [ ]:
start_time = time.time()
deep_intermediate = add_columns_serial(deep, motives, fasta_dir)

deep_intermediate = deep_intermediate.sort_values(['seqid','start','end']).reset_index(drop=True)

deep_final = deep_intermediate.copy()

motif_cols = list(motives['motif'])

deep_final['all_motif'] = deep_final[motif_cols].max(axis=1)
In [ ]:
# check confounding of avg usage and tissue-specificity
In [413]:
data['avg_expression'] = np.log2(data[tissues].mean(1)) # mean of means
data['avg_usage'] = data[usage_cols].mean(1) # mean of means
data['qcut_avg_usage'] = pd.qcut(data['avg_usage'],q=10) # quantiles
In [420]:
data['t']=1
gr = data.groupby(['qcut_avg_usage','ts','segment_class']).agg({'t':sum}).reset_index()
# gr['t'] = gr['t']+1 # pseudocount
gr = pd.merge(gr,gr.groupby(['qcut_avg_usage','ts']).agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on=['qcut_avg_usage','ts'])
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']

gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['segment_class']=='I'].reset_index(drop=True) # 
gr['ts'] = gr['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')

groups_with_na = list(gr.loc[(gr['prop'].isna())|(gr['t_sum']<100)]['qcut_avg_usage'].unique())
gr = gr.loc[~gr['qcut_avg_usage'].isin(groups_with_na)].reset_index(drop=True)

from statsmodels.stats import proportion as smprop

x_feature, y_feature, hue_feature = 'qcut_avg_usage', '%','ts'
order = list(gr['qcut_avg_usage'].unique())
hue_order = ['tissue-specific PAS','other PAS']
palette = ['green','royalblue']
dodge = 0.2

# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
    reorder_dict_x[x_val] = i
    i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
    reorder_dict_hue[hue_val] = i
    i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']-dodge*((gr_reordered['hue_order']==0).astype('int')*2-1)

sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1.1))

# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette,dodge=True)

ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])], 
            elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")

ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel = '% of PAS in intronic\nclass')
ax.tick_params(left=True, bottom=True,width=0.5)

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_intronic_fraction.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_intronic_fraction.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [421]:
data['t']=1
gr = data.groupby(['qcut_avg_usage','ts','segment_class']).agg({'t':sum}).reset_index()
# gr['t'] = gr['t']+1 # pseudocount
gr = pd.merge(gr,gr.groupby(['qcut_avg_usage','ts']).agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on=['qcut_avg_usage','ts'])
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']

gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['segment_class']=='TE'].reset_index(drop=True) # 
gr['ts'] = gr['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')

groups_with_na = list(gr.loc[(gr['prop'].isna())|(gr['t_sum']<100)]['qcut_avg_usage'].unique())
gr = gr.loc[~gr['qcut_avg_usage'].isin(groups_with_na)].reset_index(drop=True)

from statsmodels.stats import proportion as smprop

x_feature, y_feature, hue_feature = 'qcut_avg_usage', '%','ts'
order = list(gr['qcut_avg_usage'].unique())
hue_order = ['tissue-specific PAS','other PAS']
palette = ['green','royalblue']
dodge = 0.2

# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
    reorder_dict_x[x_val] = i
    i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
    reorder_dict_hue[hue_val] = i
    i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']-dodge*((gr_reordered['hue_order']==0).astype('int')*2-1)

sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1.1))

# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette,dodge=True)

ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])], 
            elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")

ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel = '% of PAS in\n"terminal exon"\nclass')
ax.tick_params(left=True, bottom=True,width=0.5)

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_TE_fraction.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_TE_fraction.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [426]:
data['t']=1
gr = data.groupby(['qcut_avg_usage','ts','all_motif']).agg({'t':sum}).reset_index()
# gr['t'] = gr['t']+1 # pseudocount
gr = pd.merge(gr,gr.groupby(['qcut_avg_usage','ts']).agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on=['qcut_avg_usage','ts'])
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']

gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['all_motif']==1].reset_index(drop=True) # 
gr['ts'] = gr['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')

groups_with_na = list(gr.loc[(gr['prop'].isna())|(gr['t_sum']<100)]['qcut_avg_usage'].unique())
gr = gr.loc[~gr['qcut_avg_usage'].isin(groups_with_na)].reset_index(drop=True)

from statsmodels.stats import proportion as smprop

x_feature, y_feature, hue_feature = 'qcut_avg_usage', '%','ts'
order = list(gr['qcut_avg_usage'].unique())
hue_order = ['tissue-specific PAS','other PAS']
palette = ['green','royalblue']
dodge = 0.2

# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
    reorder_dict_x[x_val] = i
    i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
    reorder_dict_hue[hue_val] = i
    i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']-dodge*((gr_reordered['hue_order']==0).astype('int')*2-1)

sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1.1))

# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette,dodge=True)

ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])], 
            elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")

ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel = 'Motif presence, %')
ax.tick_params(left=True, bottom=True,width=0.5)

out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_motif_presence.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_motif_presence.pdf',bbox_inches='tight',dpi=600)
No description has been provided for this image
In [217]:
sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1))

data_to_show = data.copy()
data_to_show['ts'] = data_to_show['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')

ax = sns.boxplot(data = data_to_show,x='ts',y='avg_usage',showfliers=False)
No description has been provided for this image